From 125bd061c3240afd92edd5ef8c29a7b3d24a5cd1 Mon Sep 17 00:00:00 2001 From: josel-amd <166385423+josel-amd@users.noreply.github.com> Date: Tue, 28 May 2024 14:36:24 +0200 Subject: [PATCH 01/89] [mlir][emitc] Support conversion of arith.divsi and arith.remsi to EmitC (#93450) --- mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp | 2 ++ mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp index 0be3d76f556de9..388794ec122d21 100644 --- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp +++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp @@ -394,7 +394,9 @@ void mlir::populateArithToEmitCPatterns(TypeConverter &typeConverter, ArithConstantOpConversionPattern, ArithOpConversion, ArithOpConversion, + ArithOpConversion, ArithOpConversion, + ArithOpConversion, ArithOpConversion, IntegerOpConversion, IntegerOpConversion, diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir index b453b69a214e86..dac3fd99b607ce 100644 --- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir +++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir @@ -88,6 +88,17 @@ func.func @arith_index(%arg0: index, %arg1: index) { // ----- +// CHECK-LABEL: arith_signed_integer_div_rem +func.func @arith_signed_integer_div_rem(%arg0: i32, %arg1: i32) { + // CHECK: emitc.div %arg0, %arg1 : (i32, i32) -> i32 + %0 = arith.divsi %arg0, %arg1 : i32 + // CHECK: emitc.rem %arg0, %arg1 : (i32, i32) -> i32 + %1 = arith.remsi %arg0, %arg1 : i32 + return +} + +// ----- + func.func @arith_select(%arg0: i1, %arg1: tensor<8xi32>, %arg2: tensor<8xi32>) -> () { // CHECK: [[V0:[^ ]*]] = emitc.conditional %arg0, %arg1, %arg2 : tensor<8xi32> %0 = arith.select %arg0, %arg1, %arg2 : i1, tensor<8xi32> From fe5d791517b1cc11bd518f0338516f157fe18661 Mon Sep 17 00:00:00 2001 From: Matt 
Arsenault Date: Thu, 23 May 2024 21:18:17 +0200 Subject: [PATCH 02/89] AMDGPU: Add some multi-use negative tests for minimum3/maximum3 --- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 206 ++++++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/fminimum3.ll | 206 ++++++++++++++++++++++++++ 2 files changed, 412 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 9690e126dfcfcb..3ec36f03a48aa4 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -3249,3 +3249,209 @@ define double @v_fmaximum3_f64_const1_const2(double %a) { %max1 = call double @llvm.maximum.f64(double %max0, double 16.0) ret double %max1 } + +define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c) { +; GFX12-LABEL: v_no_fmaximum3_f32__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f32 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f32 v1, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fmaximum3_f32__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + %insert.0 = insertelement <2 x float> poison, float %max0, i32 0 + %insert.1 = insertelement <2 x float> %insert.0, float %max1, i32 1 + ret <2 x float> %insert.1 +} + +define amdgpu_ps <2 x i32> 
@s_no_fmaximum3_f32__multi_use(float inreg %a, float inreg %b, float inreg %c) { +; GFX12-LABEL: s_no_fmaximum3_f32__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_maximum_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_maximum_f32 s1, s0, s2 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_no_fmaximum3_f32__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_max_f32_e32 v1, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + %cast0 = bitcast float %max0 to i32 + %cast1 = bitcast float %max1 to i32 + %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0) + %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast1) + %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) { +; GFX12-LABEL: v_no_fmaximum3_f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f16 v1, v0, v2 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fmaximum3_f16__multi_use: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + %insert.0 = insertelement <2 x half> poison, half %max0, i32 0 + %insert.1 = insertelement <2 x half> %insert.0, half %max1, i32 1 + ret <2 x half> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half inreg %b, half inreg %c) { +; GFX12-LABEL: s_no_fmaximum3_f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_maximum_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: s_maximum_f16 s1, s0, s2 +; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX12-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_no_fmaximum3_f16__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + %cast0 = bitcast half %max0 to i16 + %cast1 = bitcast half %max1 to i16 + %ext0 = zext i16 %cast0 
to i32 + %ext1 = zext i16 %cast1 to i32 + %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext0) + %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext1) + %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_no_fmaximum3_v2f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_maximum_f16 v1, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fmaximum3_v2f16__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.maximum.f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.maximum.f16(<2 x half> %max0, <2 x half> %c) + %concat = shufflevector <2 x half> %max0, <2 x half> %max1, <4 x i32> + ret <4 x 
half> %concat +} + +define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double %c) { +; GFX12-LABEL: v_no_fmaximum3_f64__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[2:3], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fmaximum3_f64__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.maximum.f64(double %a, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double %c) + %insert.0 = insertelement <2 x double> poison, double %max0, i32 0 + %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1 + ret <2 x double> %insert.1 +} diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 7481fff251d895..0e0b73b88d2dca 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -3249,3 +3249,209 @@ define double @v_fminimum3_f64_const1_const2(double %a) { %max1 = call double @llvm.minimum.f64(double %max0, double 16.0) ret double %max1 } + +define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c) { +; GFX12-LABEL: v_no_fminimum3_f32__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f32 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f32 v1, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fminimum3_f32__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + %insert.0 = insertelement <2 x float> poison, float %max0, i32 0 + %insert.1 = insertelement <2 x float> %insert.0, float %max1, i32 1 + ret <2 x float> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_no_fminimum3_f32__multi_use(float inreg %a, float inreg %b, float inreg %c) { +; GFX12-LABEL: s_no_fminimum3_f32__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_minimum_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_minimum_f32 s1, s0, s2 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_no_fminimum3_f32__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_min_f32_e32 v1, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call 
float @llvm.minimum.f32(float %max0, float %c) + %cast0 = bitcast float %max0 to i32 + %cast1 = bitcast float %max1 to i32 + %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0) + %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast1) + %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) { +; GFX12-LABEL: v_no_fminimum3_f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f16 v1, v0, v2 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fminimum3_f16__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + %insert.0 = insertelement <2 x half> poison, half %max0, i32 0 + %insert.1 = insertelement <2 x half> %insert.0, half %max1, i32 1 + ret <2 x half> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half inreg %b, half inreg %c) { +; GFX12-LABEL: s_no_fminimum3_f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_minimum_f16 s0, s0, s1 +; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: s_minimum_f16 s1, s0, s2 +; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX12-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_no_fminimum3_f16__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + %cast0 = bitcast half %max0 to i16 + %cast1 = bitcast half %max1 to i16 + %ext0 = zext i16 %cast0 to i32 + %ext1 = zext i16 %cast1 to i32 + %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext0) + %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext1) + %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_no_fminimum3_v2f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_minimum_f16 v1, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fminimum3_v2f16__multi_use: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.minimum.f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.minimum.f16(<2 x half> %max0, <2 x half> %c) + %concat = shufflevector <2 x half> %max0, <2 x half> %max1, <4 x i32> + ret <4 x half> %concat +} + +define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double %c) { +; GFX12-LABEL: v_no_fminimum3_f64__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[2:3], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fminimum3_f64__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc +; GFX9-NEXT: v_min_f64 
v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.minimum.f64(double %a, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + %insert.0 = insertelement <2 x double> poison, double %max0, i32 0 + %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1 + ret <2 x double> %insert.1 +} From 1da52caf2946e56f69eae75a60088a54edda1db5 Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Tue, 28 May 2024 08:50:55 -0400 Subject: [PATCH 03/89] [flang] Fix typos PPC intrinsics tests (NFC) (#92943) --- flang/test/Lower/PowerPC/ppc-vec-load.f90 | 119 +++++++-------- .../Lower/PowerPC/ppc-vec-shift-be-le.f90 | 140 +++++++++--------- 2 files changed, 130 insertions(+), 129 deletions(-) diff --git a/flang/test/Lower/PowerPC/ppc-vec-load.f90 b/flang/test/Lower/PowerPC/ppc-vec-load.f90 index 4d51512df0f7b4..a81ed055ce08c8 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-load.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-load.f90 @@ -1,12 +1,13 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s +! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE","LLVM" %s +! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr9 -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR_P9","LLVM" %s +! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE","LLVM" %s ! REQUIRES: target=powerpc{{.*}} !---------------------- ! 
vec_ld !---------------------- -! CHECK-LABEL: @vec_ld_testi8 +! LLVM-LABEL: @vec_ld_testi8 subroutine vec_ld_testi8(arg1, arg2, res) integer(1) :: arg1 vector(integer(1)) :: arg2, res @@ -19,7 +20,7 @@ subroutine vec_ld_testi8(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testi8 -! CHECK-LABEL: @vec_ld_testi16 +! LLVM-LABEL: @vec_ld_testi16 subroutine vec_ld_testi16(arg1, arg2, res) integer(2) :: arg1 vector(integer(2)) :: arg2, res @@ -32,7 +33,7 @@ subroutine vec_ld_testi16(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testi16 -! CHECK-LABEL: @vec_ld_testi32 +! LLVM-LABEL: @vec_ld_testi32 subroutine vec_ld_testi32(arg1, arg2, res) integer(4) :: arg1 vector(integer(4)) :: arg2, res @@ -44,7 +45,7 @@ subroutine vec_ld_testi32(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testi32 -! CHECK-LABEL: @vec_ld_testf32 +! LLVM-LABEL: @vec_ld_testf32 subroutine vec_ld_testf32(arg1, arg2, res) integer(8) :: arg1 vector(real(4)) :: arg2, res @@ -58,7 +59,7 @@ subroutine vec_ld_testf32(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testf32 -! CHECK-LABEL: @vec_ld_testu32 +! LLVM-LABEL: @vec_ld_testu32 subroutine vec_ld_testu32(arg1, arg2, res) integer(1) :: arg1 vector(unsigned(4)) :: arg2, res @@ -70,7 +71,7 @@ subroutine vec_ld_testu32(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_ld_testu32 -! CHECK-LABEL: @vec_ld_testi32a +! LLVM-LABEL: @vec_ld_testi32a subroutine vec_ld_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(10) @@ -83,7 +84,7 @@ subroutine vec_ld_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_ld_testi32a -! CHECK-LABEL: @vec_ld_testf32av +! 
LLVM-LABEL: @vec_ld_testf32av subroutine vec_ld_testf32av(arg1, arg2, res) integer(8) :: arg1 vector(real(4)) :: arg2(2, 4, 8) @@ -98,7 +99,7 @@ subroutine vec_ld_testf32av(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testf32av -! CHECK-LABEL: @vec_ld_testi32s +! LLVM-LABEL: @vec_ld_testi32s subroutine vec_ld_testi32s(arg1, arg2, res) integer(4) :: arg1 real(4) :: arg2 @@ -116,7 +117,7 @@ end subroutine vec_ld_testi32s ! vec_lde !---------------------- -! CHECK-LABEL: @vec_lde_testi8s +! LLVM-LABEL: @vec_lde_testi8s subroutine vec_lde_testi8s(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2 @@ -129,7 +130,7 @@ subroutine vec_lde_testi8s(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[call]], ptr %2, align 16 end subroutine vec_lde_testi8s -! CHECK-LABEL: @vec_lde_testi16a +! LLVM-LABEL: @vec_lde_testi16a subroutine vec_lde_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(2, 4, 8) @@ -142,7 +143,7 @@ subroutine vec_lde_testi16a(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[call]], ptr %2, align 16 end subroutine vec_lde_testi16a -! CHECK-LABEL: @vec_lde_testi32a +! LLVM-LABEL: @vec_lde_testi32a subroutine vec_lde_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(4) @@ -155,7 +156,7 @@ subroutine vec_lde_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_lde_testi32a -! CHECK-LABEL: @vec_lde_testf32a +! LLVM-LABEL: @vec_lde_testf32a subroutine vec_lde_testf32a(arg1, arg2, res) integer(8) :: arg1 real(4) :: arg2(4) @@ -173,7 +174,7 @@ end subroutine vec_lde_testf32a ! vec_ldl !---------------------- -! CHECK-LABEL: @vec_ldl_testi8 +! LLVM-LABEL: @vec_ldl_testi8 subroutine vec_ldl_testi8(arg1, arg2, res) integer(1) :: arg1 vector(integer(1)) :: arg2, res @@ -186,7 +187,7 @@ subroutine vec_ldl_testi8(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testi8 -! CHECK-LABEL: @vec_ldl_testi16 +! 
LLVM-LABEL: @vec_ldl_testi16 subroutine vec_ldl_testi16(arg1, arg2, res) integer(2) :: arg1 vector(integer(2)) :: arg2, res @@ -199,7 +200,7 @@ subroutine vec_ldl_testi16(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testi16 -! CHECK-LABEL: @vec_ldl_testi32 +! LLVM-LABEL: @vec_ldl_testi32 subroutine vec_ldl_testi32(arg1, arg2, res) integer(4) :: arg1 vector(integer(4)) :: arg2, res @@ -211,7 +212,7 @@ subroutine vec_ldl_testi32(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testi32 -! CHECK-LABEL: @vec_ldl_testf32 +! LLVM-LABEL: @vec_ldl_testf32 subroutine vec_ldl_testf32(arg1, arg2, res) integer(8) :: arg1 vector(real(4)) :: arg2, res @@ -225,7 +226,7 @@ subroutine vec_ldl_testf32(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testf32 -! CHECK-LABEL: @vec_ldl_testu32 +! LLVM-LABEL: @vec_ldl_testu32 subroutine vec_ldl_testu32(arg1, arg2, res) integer(1) :: arg1 vector(unsigned(4)) :: arg2, res @@ -237,7 +238,7 @@ subroutine vec_ldl_testu32(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_ldl_testu32 -! CHECK-LABEL: @vec_ldl_testi32a +! LLVM-LABEL: @vec_ldl_testi32a subroutine vec_ldl_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(10) @@ -250,7 +251,7 @@ subroutine vec_ldl_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_ldl_testi32a -! CHECK-LABEL: @vec_ldl_testf32av +! LLVM-LABEL: @vec_ldl_testf32av subroutine vec_ldl_testf32av(arg1, arg2, res) integer(8) :: arg1 vector(real(4)) :: arg2(2, 4, 8) @@ -264,7 +265,7 @@ subroutine vec_ldl_testf32av(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testf32av -! CHECK-LABEL: @vec_ldl_testi32s +! 
LLVM-LABEL: @vec_ldl_testi32s subroutine vec_ldl_testi32s(arg1, arg2, res) integer(4) :: arg1 real(4) :: arg2 @@ -282,7 +283,7 @@ end subroutine vec_ldl_testi32s ! vec_lvsl !---------------------- -! CHECK-LABEL: @vec_lvsl_testi8s +! LLVM-LABEL: @vec_lvsl_testi8s subroutine vec_lvsl_testi8s(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2 @@ -300,7 +301,7 @@ subroutine vec_lvsl_testi8s(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_lvsl_testi8s -! CHECK-LABEL: @vec_lvsl_testi16a +! LLVM-LABEL: @vec_lvsl_testi16a subroutine vec_lvsl_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(4) @@ -318,7 +319,7 @@ subroutine vec_lvsl_testi16a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_lvsl_testi16a -! CHECK-LABEL: @vec_lvsl_testi32a +! LLVM-LABEL: @vec_lvsl_testi32a subroutine vec_lvsl_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 3, 4) @@ -336,7 +337,7 @@ subroutine vec_lvsl_testi32a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_lvsl_testi32a -! CHECK-LABEL: @vec_lvsl_testf32a +! LLVM-LABEL: @vec_lvsl_testf32a subroutine vec_lvsl_testf32a(arg1, arg2, res) integer(8) :: arg1 real(4) :: arg2(4) @@ -357,7 +358,7 @@ end subroutine vec_lvsl_testf32a ! vec_lvsr !---------------------- -! CHECK-LABEL: @vec_lvsr_testi8s +! LLVM-LABEL: @vec_lvsr_testi8s subroutine vec_lvsr_testi8s(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2 @@ -375,7 +376,7 @@ subroutine vec_lvsr_testi8s(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[addr]], ptr %2, align 16 end subroutine vec_lvsr_testi8s -! CHECK-LABEL: @vec_lvsr_testi16a +! LLVM-LABEL: @vec_lvsr_testi16a subroutine vec_lvsr_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(4) @@ -393,7 +394,7 @@ subroutine vec_lvsr_testi16a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[addr]], ptr %2, align 16 end subroutine vec_lvsr_testi16a -! 
CHECK-LABEL: @vec_lvsr_testi32a +! LLVM-LABEL: @vec_lvsr_testi32a subroutine vec_lvsr_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 3, 4) @@ -411,7 +412,7 @@ subroutine vec_lvsr_testi32a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[addr]], ptr %2, align 16 end subroutine vec_lvsr_testi32a -! CHECK-LABEL: @vec_lvsr_testf32a +! LLVM-LABEL: @vec_lvsr_testf32a subroutine vec_lvsr_testf32a(arg1, arg2, res) integer(8) :: arg1 real(4) :: arg2(4) @@ -432,7 +433,7 @@ end subroutine vec_lvsr_testf32a ! vec_lxv !---------------------- -! CHECK-LABEL: @vec_lxv_testi8a +! LLVM-LABEL: @vec_lxv_testi8a subroutine vec_lxv_testi8a(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2(4) @@ -445,7 +446,7 @@ subroutine vec_lxv_testi8a(arg1, arg2, res) ! LLVMIR_P9: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_lxv_testi8a -! CHECK-LABEL: @vec_lxv_testi16a +! LLVM-LABEL: @vec_lxv_testi16a subroutine vec_lxv_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(2, 4, 8) @@ -458,7 +459,7 @@ subroutine vec_lxv_testi16a(arg1, arg2, res) ! LLVMIR_P9: store <8 x i16> %[[ld]], ptr %2, align 16 end subroutine vec_lxv_testi16a -! CHECK-LABEL: @vec_lxv_testi32a +! LLVM-LABEL: @vec_lxv_testi32a subroutine vec_lxv_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 4, 8) @@ -471,7 +472,7 @@ subroutine vec_lxv_testi32a(arg1, arg2, res) ! LLVMIR_P9: store <4 x i32> %[[ld]], ptr %2, align 16 end subroutine vec_lxv_testi32a -! CHECK-LABEL: @vec_lxv_testf32a +! LLVM-LABEL: @vec_lxv_testf32a subroutine vec_lxv_testf32a(arg1, arg2, res) integer(2) :: arg1 real(4) :: arg2(4) @@ -484,7 +485,7 @@ subroutine vec_lxv_testf32a(arg1, arg2, res) ! LLVMIR_P9: store <4 x float> %[[ld]], ptr %2, align 16 end subroutine vec_lxv_testf32a -! CHECK-LABEL: @vec_lxv_testf64a +! LLVM-LABEL: @vec_lxv_testf64a subroutine vec_lxv_testf64a(arg1, arg2, res) integer(8) :: arg1 real(8) :: arg2(4) @@ -501,7 +502,7 @@ end subroutine vec_lxv_testf64a ! 
vec_xld2 !---------------------- -! CHECK-LABEL: @vec_xld2_testi8a +! LLVM-LABEL: @vec_xld2_testi8a subroutine vec_xld2_testi8a(arg1, arg2, res) integer(1) :: arg1 vector(integer(1)) :: arg2(4) @@ -515,7 +516,7 @@ subroutine vec_xld2_testi8a(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testi8a -! CHECK-LABEL: @vec_xld2_testi16 +! LLVM-LABEL: @vec_xld2_testi16 subroutine vec_xld2_testi16(arg1, arg2, res) integer :: arg1 vector(integer(2)) :: arg2 @@ -529,7 +530,7 @@ subroutine vec_xld2_testi16(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testi16 -! CHECK-LABEL: @vec_xld2_testi32a +! LLVM-LABEL: @vec_xld2_testi32a subroutine vec_xld2_testi32a(arg1, arg2, res) integer(4) :: arg1 vector(integer(4)) :: arg2(41) @@ -543,7 +544,7 @@ subroutine vec_xld2_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testi32a -! CHECK-LABEL: @vec_xld2_testi64a +! LLVM-LABEL: @vec_xld2_testi64a subroutine vec_xld2_testi64a(arg1, arg2, res) integer(8) :: arg1 vector(integer(8)) :: arg2(4) @@ -557,7 +558,7 @@ subroutine vec_xld2_testi64a(arg1, arg2, res) ! LLVMIR: store <2 x i64> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testi64a -! CHECK-LABEL: @vec_xld2_testf32a +! LLVM-LABEL: @vec_xld2_testf32a subroutine vec_xld2_testf32a(arg1, arg2, res) integer(2) :: arg1 vector(real(4)) :: arg2(4) @@ -571,7 +572,7 @@ subroutine vec_xld2_testf32a(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testf32a -! CHECK-LABEL: @vec_xld2_testf64a +! LLVM-LABEL: @vec_xld2_testf64a subroutine vec_xld2_testf64a(arg1, arg2, res) integer(8) :: arg1 vector(real(8)) :: arg2(4) @@ -588,7 +589,7 @@ end subroutine vec_xld2_testf64a ! vec_xl !---------------------- -! CHECK-LABEL: @vec_xl_testi8a +! 
LLVM-LABEL: @vec_xl_testi8a subroutine vec_xl_testi8a(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2(4) @@ -601,7 +602,7 @@ subroutine vec_xl_testi8a(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_xl_testi8a -! CHECK-LABEL: @vec_xl_testi16a +! LLVM-LABEL: @vec_xl_testi16a subroutine vec_xl_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(2, 4, 8) @@ -614,7 +615,7 @@ subroutine vec_xl_testi16a(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[ld]], ptr %2, align 16 end subroutine vec_xl_testi16a -! CHECK-LABEL: @vec_xl_testi32a +! LLVM-LABEL: @vec_xl_testi32a subroutine vec_xl_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 4, 8) @@ -627,7 +628,7 @@ subroutine vec_xl_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[ld]], ptr %2, align 16 end subroutine vec_xl_testi32a -! CHECK-LABEL: @vec_xl_testi64a +! LLVM-LABEL: @vec_xl_testi64a subroutine vec_xl_testi64a(arg1, arg2, res) integer(8) :: arg1 integer(8) :: arg2(2, 4, 8) @@ -641,7 +642,7 @@ subroutine vec_xl_testi64a(arg1, arg2, res) ! LLVMIR: store <2 x i64> %[[bc]], ptr %2, align 16 end subroutine vec_xl_testi64a -! CHECK-LABEL: @vec_xl_testf32a +! LLVM-LABEL: @vec_xl_testf32a subroutine vec_xl_testf32a(arg1, arg2, res) integer(2) :: arg1 real(4) :: arg2(4) @@ -655,7 +656,7 @@ subroutine vec_xl_testf32a(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_xl_testf32a -! CHECK-LABEL: @vec_xl_testf64a +! LLVM-LABEL: @vec_xl_testf64a subroutine vec_xl_testf64a(arg1, arg2, res) integer(8) :: arg1 real(8) :: arg2 @@ -672,7 +673,7 @@ end subroutine vec_xl_testf64a ! vec_xlds !---------------------- -! CHECK-LABEL: @vec_xlds_testi64a +! LLVM-LABEL: @vec_xlds_testi64a subroutine vec_xlds_testi64a(arg1, arg2, res) integer(8) :: arg1 vector(integer(8)) :: arg2(4) @@ -687,7 +688,7 @@ subroutine vec_xlds_testi64a(arg1, arg2, res) ! 
LLVMIR: store <2 x i64> %[[shfl]], ptr %2, align 16 end subroutine vec_xlds_testi64a -! CHECK-LABEL: @vec_xlds_testf64a +! LLVM-LABEL: @vec_xlds_testf64a subroutine vec_xlds_testf64a(arg1, arg2, res) integer(8) :: arg1 vector(real(8)) :: arg2(4) @@ -707,7 +708,7 @@ end subroutine vec_xlds_testf64a ! vec_xl_be !---------------------- -! CHECK-LABEL: @vec_xl_be_testi8a +! LLVM-LABEL: @vec_xl_be_testi8a subroutine vec_xl_be_testi8a(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2(2, 4, 8) @@ -722,7 +723,7 @@ subroutine vec_xl_be_testi8a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testi8a -! CHECK-LABEL: @vec_xl_be_testi16a +! LLVM-LABEL: @vec_xl_be_testi16a subroutine vec_xl_be_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(2, 4, 8) @@ -737,7 +738,7 @@ subroutine vec_xl_be_testi16a(arg1, arg2, res) ! LLVMIR-BE: store <8 x i16> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testi16a -! CHECK-LABEL: @vec_xl_be_testi32a +! LLVM-LABEL: @vec_xl_be_testi32a subroutine vec_xl_be_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 4, 8) @@ -752,7 +753,7 @@ subroutine vec_xl_be_testi32a(arg1, arg2, res) ! LLVMIR-BE: store <4 x i32> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testi32a -! CHECK-LABEL: @vec_xl_be_testi64a +! LLVM-LABEL: @vec_xl_be_testi64a subroutine vec_xl_be_testi64a(arg1, arg2, res) integer(8) :: arg1 integer(8) :: arg2(2, 4, 8) @@ -767,7 +768,7 @@ subroutine vec_xl_be_testi64a(arg1, arg2, res) ! LLVMIR-BE: store <2 x i64> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testi64a -! CHECK-LABEL: @vec_xl_be_testf32a +! LLVM-LABEL: @vec_xl_be_testf32a subroutine vec_xl_be_testf32a(arg1, arg2, res) integer(2) :: arg1 real(4) :: arg2(4) @@ -782,7 +783,7 @@ subroutine vec_xl_be_testf32a(arg1, arg2, res) ! LLVMIR-BE: store <4 x float> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testf32a -! CHECK-LABEL: @vec_xl_be_testf64a +! 
LLVM-LABEL: @vec_xl_be_testf64a subroutine vec_xl_be_testf64a(arg1, arg2, res) integer(8) :: arg1 real(8) :: arg2(7) @@ -801,7 +802,7 @@ end subroutine vec_xl_be_testf64a ! vec_xlw4 !---------------------- -! CHECK-LABEL: @vec_xlw4_testi8a +! LLVM-LABEL: @vec_xlw4_testi8a subroutine vec_xlw4_testi8a(arg1, arg2, res) integer(1) :: arg1 vector(integer(1)) :: arg2(2, 4, 8) @@ -815,7 +816,7 @@ subroutine vec_xlw4_testi8a(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[res]], ptr %2, align 16 end subroutine vec_xlw4_testi8a -! CHECK-LABEL: @vec_xlw4_testi16a +! LLVM-LABEL: @vec_xlw4_testi16a subroutine vec_xlw4_testi16a(arg1, arg2, res) integer(2) :: arg1 vector(integer(2)) :: arg2(2, 4, 8) @@ -829,7 +830,7 @@ subroutine vec_xlw4_testi16a(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[res]], ptr %2, align 16 end subroutine vec_xlw4_testi16a -! CHECK-LABEL: @vec_xlw4_testu32a +! LLVM-LABEL: @vec_xlw4_testu32a subroutine vec_xlw4_testu32a(arg1, arg2, res) integer(4) :: arg1 vector(unsigned(4)) :: arg2(2, 4, 8) @@ -842,7 +843,7 @@ subroutine vec_xlw4_testu32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[ld]], ptr %2, align 16 end subroutine vec_xlw4_testu32a -! CHECK-LABEL: @vec_xlw4_testf32a +! LLVM-LABEL: @vec_xlw4_testf32a subroutine vec_xlw4_testf32a(arg1, arg2, res) integer(2) :: arg1 vector(real(4)) :: arg2(4) diff --git a/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90 b/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90 index bd83f28b4eeb52..6c4f202f89a456 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90 @@ -1,13 +1,13 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="CHECK" %s +! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR","LLVM" %s ! -! 
RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64-unknown-aix -o - | FileCheck --check-prefixes="BE-LLVMIR" %s +! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64-unknown-aix -o - | FileCheck --check-prefixes="BE-LLVMIR","LLVM" %s ! REQUIRES: target=powerpc{{.*}} !---------------------- ! vec_sld !---------------------- -! CHECK-LABEL: vec_sld_test_i1i1 +! LLVM-LABEL: vec_sld_test_i1i1 subroutine vec_sld_test_i1i1(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -23,7 +23,7 @@ subroutine vec_sld_test_i1i1(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i1i1 -! CHECK-LABEL: vec_sld_test_i1i2 +! LLVM-LABEL: vec_sld_test_i1i2 subroutine vec_sld_test_i1i2(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -39,7 +39,7 @@ subroutine vec_sld_test_i1i2(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i1i2 -! CHECK-LABEL: vec_sld_test_i1i4 +! LLVM-LABEL: vec_sld_test_i1i4 subroutine vec_sld_test_i1i4(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -55,7 +55,7 @@ subroutine vec_sld_test_i1i4(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i1i4 -! CHECK-LABEL: vec_sld_test_i1i8 +! LLVM-LABEL: vec_sld_test_i1i8 subroutine vec_sld_test_i1i8(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_8) @@ -71,7 +71,7 @@ subroutine vec_sld_test_i1i8(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i1i8 -! CHECK-LABEL: vec_sld_test_i2i1 +! LLVM-LABEL: vec_sld_test_i2i1 subroutine vec_sld_test_i2i1(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -93,7 +93,7 @@ subroutine vec_sld_test_i2i1(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i2i1 -! 
CHECK-LABEL: vec_sld_test_i2i2 +! LLVM-LABEL: vec_sld_test_i2i2 subroutine vec_sld_test_i2i2(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 8_2) @@ -115,7 +115,7 @@ subroutine vec_sld_test_i2i2(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i2i2 -! CHECK-LABEL: vec_sld_test_i2i4 +! LLVM-LABEL: vec_sld_test_i2i4 subroutine vec_sld_test_i2i4(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -137,7 +137,7 @@ subroutine vec_sld_test_i2i4(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i2i4 -! CHECK-LABEL: vec_sld_test_i2i8 +! LLVM-LABEL: vec_sld_test_i2i8 subroutine vec_sld_test_i2i8(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 11_8) @@ -159,7 +159,7 @@ subroutine vec_sld_test_i2i8(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i2i8 -! CHECK-LABEL: vec_sld_test_i4i1 +! LLVM-LABEL: vec_sld_test_i4i1 subroutine vec_sld_test_i4i1(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -181,7 +181,7 @@ subroutine vec_sld_test_i4i1(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i4i1 -! CHECK-LABEL: vec_sld_test_i4i2 +! LLVM-LABEL: vec_sld_test_i4i2 subroutine vec_sld_test_i4i2(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -203,7 +203,7 @@ subroutine vec_sld_test_i4i2(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i4i2 -! CHECK-LABEL: vec_sld_test_i4i4 +! LLVM-LABEL: vec_sld_test_i4i4 subroutine vec_sld_test_i4i4(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -225,7 +225,7 @@ subroutine vec_sld_test_i4i4(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i4i4 -! CHECK-LABEL: vec_sld_test_i4i8 +! 
LLVM-LABEL: vec_sld_test_i4i8 subroutine vec_sld_test_i4i8(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_8) @@ -247,7 +247,7 @@ subroutine vec_sld_test_i4i8(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i4i8 -! CHECK-LABEL: vec_sld_test_u1i1 +! LLVM-LABEL: vec_sld_test_u1i1 subroutine vec_sld_test_u1i1(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -263,7 +263,7 @@ subroutine vec_sld_test_u1i1(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u1i1 -! CHECK-LABEL: vec_sld_test_u1i2 +! LLVM-LABEL: vec_sld_test_u1i2 subroutine vec_sld_test_u1i2(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -279,7 +279,7 @@ subroutine vec_sld_test_u1i2(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u1i2 -! CHECK-LABEL: vec_sld_test_u1i4 +! LLVM-LABEL: vec_sld_test_u1i4 subroutine vec_sld_test_u1i4(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -295,7 +295,7 @@ subroutine vec_sld_test_u1i4(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u1i4 -! CHECK-LABEL: vec_sld_test_u1i8 +! LLVM-LABEL: vec_sld_test_u1i8 subroutine vec_sld_test_u1i8(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -311,7 +311,7 @@ subroutine vec_sld_test_u1i8(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u1i8 -! CHECK-LABEL: vec_sld_test_u2i1 +! LLVM-LABEL: vec_sld_test_u2i1 subroutine vec_sld_test_u2i1(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -333,7 +333,7 @@ subroutine vec_sld_test_u2i1(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u2i1 -! CHECK-LABEL: vec_sld_test_u2i2 +! 
LLVM-LABEL: vec_sld_test_u2i2 subroutine vec_sld_test_u2i2(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -355,7 +355,7 @@ subroutine vec_sld_test_u2i2(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u2i2 -! CHECK-LABEL: vec_sld_test_u2i4 +! LLVM-LABEL: vec_sld_test_u2i4 subroutine vec_sld_test_u2i4(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -377,7 +377,7 @@ subroutine vec_sld_test_u2i4(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u2i4 -! CHECK-LABEL: vec_sld_test_u2i8 +! LLVM-LABEL: vec_sld_test_u2i8 subroutine vec_sld_test_u2i8(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_8) @@ -399,7 +399,7 @@ subroutine vec_sld_test_u2i8(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u2i8 -! CHECK-LABEL: vec_sld_test_u4i1 +! LLVM-LABEL: vec_sld_test_u4i1 subroutine vec_sld_test_u4i1(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -421,7 +421,7 @@ subroutine vec_sld_test_u4i1(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u4i1 -! CHECK-LABEL: vec_sld_test_u4i2 +! LLVM-LABEL: vec_sld_test_u4i2 subroutine vec_sld_test_u4i2(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -443,7 +443,7 @@ subroutine vec_sld_test_u4i2(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u4i2 -! CHECK-LABEL: vec_sld_test_u4i4 +! LLVM-LABEL: vec_sld_test_u4i4 subroutine vec_sld_test_u4i4(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -465,7 +465,7 @@ subroutine vec_sld_test_u4i4(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u4i4 -! CHECK-LABEL: vec_sld_test_u4i8 +! 
LLVM-LABEL: vec_sld_test_u4i8 subroutine vec_sld_test_u4i8(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_8) @@ -487,7 +487,7 @@ subroutine vec_sld_test_u4i8(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u4i8 -! CHECK-LABEL: vec_sld_test_r4i1 +! LLVM-LABEL: vec_sld_test_r4i1 subroutine vec_sld_test_r4i1(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -509,7 +509,7 @@ subroutine vec_sld_test_r4i1(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_r4i1 -! CHECK-LABEL: vec_sld_test_r4i2 +! LLVM-LABEL: vec_sld_test_r4i2 subroutine vec_sld_test_r4i2(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -531,7 +531,7 @@ subroutine vec_sld_test_r4i2(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_r4i2 -! CHECK-LABEL: vec_sld_test_r4i4 +! LLVM-LABEL: vec_sld_test_r4i4 subroutine vec_sld_test_r4i4(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -553,7 +553,7 @@ subroutine vec_sld_test_r4i4(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_r4i4 -! CHECK-LABEL: vec_sld_test_r4i8 +! LLVM-LABEL: vec_sld_test_r4i8 subroutine vec_sld_test_r4i8(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 1_8) @@ -578,7 +578,7 @@ end subroutine vec_sld_test_r4i8 !---------------------- ! vec_sldw !---------------------- -! CHECK-LABEL: vec_sldw_test_i1i1 +! LLVM-LABEL: vec_sldw_test_i1i1 subroutine vec_sldw_test_i1i1(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -594,7 +594,7 @@ subroutine vec_sldw_test_i1i1(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i1i1 -! CHECK-LABEL: vec_sldw_test_i1i2 +! 
LLVM-LABEL: vec_sldw_test_i1i2 subroutine vec_sldw_test_i1i2(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -610,7 +610,7 @@ subroutine vec_sldw_test_i1i2(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i1i2 -! CHECK-LABEL: vec_sldw_test_i1i4 +! LLVM-LABEL: vec_sldw_test_i1i4 subroutine vec_sldw_test_i1i4(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -626,7 +626,7 @@ subroutine vec_sldw_test_i1i4(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i1i4 -! CHECK-LABEL: vec_sldw_test_i1i8 +! LLVM-LABEL: vec_sldw_test_i1i8 subroutine vec_sldw_test_i1i8(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -642,7 +642,7 @@ subroutine vec_sldw_test_i1i8(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i1i8 -! CHECK-LABEL: vec_sldw_test_i2i1 +! LLVM-LABEL: vec_sldw_test_i2i1 subroutine vec_sldw_test_i2i1(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -664,7 +664,7 @@ subroutine vec_sldw_test_i2i1(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i2i1 -! CHECK-LABEL: vec_sldw_test_i2i2 +! LLVM-LABEL: vec_sldw_test_i2i2 subroutine vec_sldw_test_i2i2(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -686,7 +686,7 @@ subroutine vec_sldw_test_i2i2(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i2i2 -! CHECK-LABEL: vec_sldw_test_i2i4 +! LLVM-LABEL: vec_sldw_test_i2i4 subroutine vec_sldw_test_i2i4(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -708,7 +708,7 @@ subroutine vec_sldw_test_i2i4(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i2i4 -! CHECK-LABEL: vec_sldw_test_i2i8 +! 
LLVM-LABEL: vec_sldw_test_i2i8 subroutine vec_sldw_test_i2i8(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -730,7 +730,7 @@ subroutine vec_sldw_test_i2i8(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i2i8 -! CHECK-LABEL: vec_sldw_test_i4i1 +! LLVM-LABEL: vec_sldw_test_i4i1 subroutine vec_sldw_test_i4i1(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -752,7 +752,7 @@ subroutine vec_sldw_test_i4i1(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i4i1 -! CHECK-LABEL: vec_sldw_test_i4i2 +! LLVM-LABEL: vec_sldw_test_i4i2 subroutine vec_sldw_test_i4i2(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -774,7 +774,7 @@ subroutine vec_sldw_test_i4i2(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i4i2 -! CHECK-LABEL: vec_sldw_test_i4i4 +! LLVM-LABEL: vec_sldw_test_i4i4 subroutine vec_sldw_test_i4i4(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -796,7 +796,7 @@ subroutine vec_sldw_test_i4i4(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i4i4 -! CHECK-LABEL: vec_sldw_test_i4i8 +! LLVM-LABEL: vec_sldw_test_i4i8 subroutine vec_sldw_test_i4i8(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -818,7 +818,7 @@ subroutine vec_sldw_test_i4i8(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i4i8 -! CHECK-LABEL: vec_sldw_test_i8i1 +! LLVM-LABEL: vec_sldw_test_i8i1 subroutine vec_sldw_test_i8i1(arg1, arg2) vector(integer(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -840,7 +840,7 @@ subroutine vec_sldw_test_i8i1(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i8i1 -! CHECK-LABEL: vec_sldw_test_i8i2 +! 
LLVM-LABEL: vec_sldw_test_i8i2 subroutine vec_sldw_test_i8i2(arg1, arg2) vector(integer(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -862,7 +862,7 @@ subroutine vec_sldw_test_i8i2(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i8i2 -! CHECK-LABEL: vec_sldw_test_i8i4 +! LLVM-LABEL: vec_sldw_test_i8i4 subroutine vec_sldw_test_i8i4(arg1, arg2) vector(integer(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -884,7 +884,7 @@ subroutine vec_sldw_test_i8i4(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i8i4 -! CHECK-LABEL: vec_sldw_test_i8i8 +! LLVM-LABEL: vec_sldw_test_i8i8 subroutine vec_sldw_test_i8i8(arg1, arg2) vector(integer(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -907,7 +907,7 @@ subroutine vec_sldw_test_i8i8(arg1, arg2) end subroutine vec_sldw_test_i8i8 -! CHECK-LABEL: vec_sldw_test_u1i1 +! LLVM-LABEL: vec_sldw_test_u1i1 subroutine vec_sldw_test_u1i1(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -923,7 +923,7 @@ subroutine vec_sldw_test_u1i1(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u1i1 -! CHECK-LABEL: vec_sldw_test_u1i2 +! LLVM-LABEL: vec_sldw_test_u1i2 subroutine vec_sldw_test_u1i2(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -939,7 +939,7 @@ subroutine vec_sldw_test_u1i2(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u1i2 -! CHECK-LABEL: vec_sldw_test_u1i4 +! LLVM-LABEL: vec_sldw_test_u1i4 subroutine vec_sldw_test_u1i4(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -955,7 +955,7 @@ subroutine vec_sldw_test_u1i4(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u1i4 -! CHECK-LABEL: vec_sldw_test_u1i8 +! 
LLVM-LABEL: vec_sldw_test_u1i8 subroutine vec_sldw_test_u1i8(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -971,7 +971,7 @@ subroutine vec_sldw_test_u1i8(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u1i8 -! CHECK-LABEL: vec_sldw_test_u2i1 +! LLVM-LABEL: vec_sldw_test_u2i1 subroutine vec_sldw_test_u2i1(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -993,7 +993,7 @@ subroutine vec_sldw_test_u2i1(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u2i1 -! CHECK-LABEL: vec_sldw_test_u2i2 +! LLVM-LABEL: vec_sldw_test_u2i2 subroutine vec_sldw_test_u2i2(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1015,7 +1015,7 @@ subroutine vec_sldw_test_u2i2(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u2i2 -! CHECK-LABEL: vec_sldw_test_u2i4 +! LLVM-LABEL: vec_sldw_test_u2i4 subroutine vec_sldw_test_u2i4(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1037,7 +1037,7 @@ subroutine vec_sldw_test_u2i4(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u2i4 -! CHECK-LABEL: vec_sldw_test_u2i8 +! LLVM-LABEL: vec_sldw_test_u2i8 subroutine vec_sldw_test_u2i8(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -1059,7 +1059,7 @@ subroutine vec_sldw_test_u2i8(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u2i8 -! CHECK-LABEL: vec_sldw_test_u4i1 +! LLVM-LABEL: vec_sldw_test_u4i1 subroutine vec_sldw_test_u4i1(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -1081,7 +1081,7 @@ subroutine vec_sldw_test_u4i1(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u4i1 -! 
CHECK-LABEL: vec_sldw_test_u4i2 +! LLVM-LABEL: vec_sldw_test_u4i2 subroutine vec_sldw_test_u4i2(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1103,7 +1103,7 @@ subroutine vec_sldw_test_u4i2(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u4i2 -! CHECK-LABEL: vec_sldw_test_u4i4 +! LLVM-LABEL: vec_sldw_test_u4i4 subroutine vec_sldw_test_u4i4(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1125,7 +1125,7 @@ subroutine vec_sldw_test_u4i4(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u4i4 -! CHECK-LABEL: vec_sldw_test_u4i8 +! LLVM-LABEL: vec_sldw_test_u4i8 subroutine vec_sldw_test_u4i8(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -1147,7 +1147,7 @@ subroutine vec_sldw_test_u4i8(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u4i8 -! CHECK-LABEL: vec_sldw_test_u8i1 +! LLVM-LABEL: vec_sldw_test_u8i1 subroutine vec_sldw_test_u8i1(arg1, arg2) vector(unsigned(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -1169,7 +1169,7 @@ subroutine vec_sldw_test_u8i1(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u8i1 -! CHECK-LABEL: vec_sldw_test_u8i2 +! LLVM-LABEL: vec_sldw_test_u8i2 subroutine vec_sldw_test_u8i2(arg1, arg2) vector(unsigned(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1191,7 +1191,7 @@ subroutine vec_sldw_test_u8i2(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u8i2 -! CHECK-LABEL: vec_sldw_test_u8i4 +! LLVM-LABEL: vec_sldw_test_u8i4 subroutine vec_sldw_test_u8i4(arg1, arg2) vector(unsigned(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1213,7 +1213,7 @@ subroutine vec_sldw_test_u8i4(arg1, arg2) ! 
BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u8i4 -! CHECK-LABEL: vec_sldw_test_u8i8 +! LLVM-LABEL: vec_sldw_test_u8i8 subroutine vec_sldw_test_u8i8(arg1, arg2) vector(unsigned(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -1235,7 +1235,7 @@ subroutine vec_sldw_test_u8i8(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u8i8 -! CHECK-LABEL: vec_sldw_test_r4i1 +! LLVM-LABEL: vec_sldw_test_r4i1 subroutine vec_sldw_test_r4i1(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -1257,7 +1257,7 @@ subroutine vec_sldw_test_r4i1(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r4i1 -! CHECK-LABEL: vec_sldw_test_r4i2 +! LLVM-LABEL: vec_sldw_test_r4i2 subroutine vec_sldw_test_r4i2(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1279,7 +1279,7 @@ subroutine vec_sldw_test_r4i2(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r4i2 -! CHECK-LABEL: vec_sldw_test_r4i4 +! LLVM-LABEL: vec_sldw_test_r4i4 subroutine vec_sldw_test_r4i4(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1301,7 +1301,7 @@ subroutine vec_sldw_test_r4i4(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r4i4 -! CHECK-LABEL: vec_sldw_test_r4i8 +! LLVM-LABEL: vec_sldw_test_r4i8 subroutine vec_sldw_test_r4i8(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -1323,7 +1323,7 @@ subroutine vec_sldw_test_r4i8(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r4i8 -! CHECK-LABEL: vec_sldw_test_r8i1 +! 
LLVM-LABEL: vec_sldw_test_r8i1 subroutine vec_sldw_test_r8i1(arg1, arg2) vector(real(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -1345,7 +1345,7 @@ subroutine vec_sldw_test_r8i1(arg1, arg2) ! BE-LLVMIR: store <2 x double> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r8i1 -! CHECK-LABEL: vec_sldw_test_r8i2 +! LLVM-LABEL: vec_sldw_test_r8i2 subroutine vec_sldw_test_r8i2(arg1, arg2) vector(real(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1367,7 +1367,7 @@ subroutine vec_sldw_test_r8i2(arg1, arg2) ! BE-LLVMIR: store <2 x double> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r8i2 -! CHECK-LABEL: vec_sldw_test_r8i4 +! LLVM-LABEL: vec_sldw_test_r8i4 subroutine vec_sldw_test_r8i4(arg1, arg2) vector(real(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1389,7 +1389,7 @@ subroutine vec_sldw_test_r8i4(arg1, arg2) ! BE-LLVMIR: store <2 x double> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r8i4 -! CHECK-LABEL: vec_sldw_test_r8i8 +! LLVM-LABEL: vec_sldw_test_r8i8 subroutine vec_sldw_test_r8i8(arg1, arg2) vector(real(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) From 01fbc5658cdfa152519e2d0842ccf7d91aaeaeaf Mon Sep 17 00:00:00 2001 From: Artem Kroviakov <71938912+akroviakov@users.noreply.github.com> Date: Tue, 28 May 2024 14:54:37 +0200 Subject: [PATCH 04/89] [mlir][vector] Add support for linearizing Insert VectorOp in VectorLinearize (#92370) Building on top of [#88204](https://github.com/llvm/llvm-project/pull/88204), this PR adds support for converting `vector.insert` into an equivalent `vector.shuffle` operation that operates on linearized (1-D) vectors. 
--- .../Vector/Transforms/VectorLinearize.cpp | 97 ++++++++++++++++++- mlir/test/Dialect/Vector/linearize.mlir | 29 ++++++ 2 files changed, 125 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index 802a64b0805ee4..156bf742f6297a 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -44,6 +44,19 @@ static bool isLessThanTargetBitWidth(Operation *op, unsigned targetBitWidth) { return true; } +static bool isLessThanOrEqualTargetBitWidth(Type t, unsigned targetBitWidth) { + VectorType vecType = dyn_cast(t); + // Reject index since getElementTypeBitWidth will abort for Index types. + if (!vecType || vecType.getElementType().isIndex()) + return false; + // There are no dimension to fold if it is a 0-D vector. + if (vecType.getRank() == 0) + return false; + unsigned trailingVecDimBitWidth = + vecType.getShape().back() * vecType.getElementTypeBitWidth(); + return trailingVecDimBitWidth <= targetBitWidth; +} + namespace { struct LinearizeConstant final : OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -355,6 +368,88 @@ struct LinearizeVectorExtract final return success(); } +private: + unsigned targetVectorBitWidth; +}; + +/// This pattern converts the InsertOp to a ShuffleOp that works on a +/// linearized vector. +/// Following, +/// vector.insert %source %destination [ position ] +/// is converted to : +/// %source_1d = vector.shape_cast %source +/// %destination_1d = vector.shape_cast %destination +/// %out_1d = vector.shuffle %destination_1d, %source_1d [ shuffle_indices_1d +/// ] %out_nd = vector.shape_cast %out_1d +/// `shuffle_indices_1d` is computed using the position of the original insert. 
+struct LinearizeVectorInsert final + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LinearizeVectorInsert( + const TypeConverter &typeConverter, MLIRContext *context, + unsigned targetVectBitWidth = std::numeric_limits::max(), + PatternBenefit benefit = 1) + : OpConversionPattern(typeConverter, context, benefit), + targetVectorBitWidth(targetVectBitWidth) {} + LogicalResult + matchAndRewrite(vector::InsertOp insertOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Type dstTy = getTypeConverter()->convertType(insertOp.getDestVectorType()); + assert(!(insertOp.getDestVectorType().isScalable() || + cast(dstTy).isScalable()) && + "scalable vectors are not supported."); + + if (!isLessThanOrEqualTargetBitWidth(insertOp.getSourceType(), + targetVectorBitWidth)) + return rewriter.notifyMatchFailure( + insertOp, "Can't flatten since targetBitWidth < OpSize"); + + // dynamic position is not supported + if (insertOp.hasDynamicPosition()) + return rewriter.notifyMatchFailure(insertOp, + "dynamic position is not supported."); + auto srcTy = insertOp.getSourceType(); + auto srcAsVec = dyn_cast(srcTy); + uint64_t srcSize = 0; + if (srcAsVec) { + srcSize = srcAsVec.getNumElements(); + } else { + return rewriter.notifyMatchFailure(insertOp, + "scalars are not supported."); + } + + auto dstShape = insertOp.getDestVectorType().getShape(); + const auto dstSize = insertOp.getDestVectorType().getNumElements(); + auto dstSizeForOffsets = dstSize; + + // compute linearized offset + int64_t linearizedOffset = 0; + auto offsetsNd = insertOp.getStaticPosition(); + for (auto [dim, offset] : llvm::enumerate(offsetsNd)) { + dstSizeForOffsets /= dstShape[dim]; + linearizedOffset += offset * dstSizeForOffsets; + } + + llvm::SmallVector indices(dstSize); + auto origValsUntil = indices.begin(); + std::advance(origValsUntil, linearizedOffset); + std::iota(indices.begin(), origValsUntil, + 0); // original values that remain [0, 
offset) + auto newValsUntil = origValsUntil; + std::advance(newValsUntil, srcSize); + std::iota(origValsUntil, newValsUntil, + dstSize); // new values [offset, offset+srcNumElements) + std::iota(newValsUntil, indices.end(), + linearizedOffset + srcSize); // the rest of original values + // [offset+srcNumElements, end) + + rewriter.replaceOpWithNewOp( + insertOp, dstTy, adaptor.getDest(), adaptor.getSource(), + rewriter.getI64ArrayAttr(indices)); + + return success(); + } + private: unsigned targetVectorBitWidth; }; @@ -410,6 +505,6 @@ void mlir::vector::populateVectorLinearizeShuffleLikeOpsPatterns( : true; }); patterns.add( + LinearizeVectorInsert, LinearizeVectorExtractStridedSlice>( typeConverter, patterns.getContext(), targetBitWidth); } diff --git a/mlir/test/Dialect/Vector/linearize.mlir b/mlir/test/Dialect/Vector/linearize.mlir index b29ceab5783d7a..31a59b809a74ba 100644 --- a/mlir/test/Dialect/Vector/linearize.mlir +++ b/mlir/test/Dialect/Vector/linearize.mlir @@ -245,3 +245,32 @@ func.func @test_vector_extract(%arg0: vector<2x8x2xf32>) -> vector<8x2xf32> { %0 = vector.extract %arg0[1]: vector<8x2xf32> from vector<2x8x2xf32> return %0 : vector<8x2xf32> } + +// ----- +// ALL-LABEL: test_vector_insert +// ALL-SAME: (%[[DEST:.*]]: vector<2x8x4xf32>, %[[SRC:.*]]: vector<8x4xf32>) -> vector<2x8x4xf32> { +func.func @test_vector_insert(%arg0: vector<2x8x4xf32>, %arg1: vector<8x4xf32>) -> vector<2x8x4xf32> { + // DEFAULT: %[[ARG_SRC:.*]] = vector.shape_cast %[[SRC]] : vector<8x4xf32> to vector<32xf32> + // DEFAULT: %[[ARG_DEST:.*]] = vector.shape_cast %[[DEST]] : vector<2x8x4xf32> to vector<64xf32> + // DEFAULT: %[[SHUFFLE:.*]] = vector.shuffle %[[ARG_DEST]], %[[ARG_SRC]] + // DEFAULT-SAME: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, + // DEFAULT-SAME: 88, 89, 90, 91, 92, 93, 94, 95, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + // DEFAULT-SAME: 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 
60, 61, 62, 63] : vector<64xf32>, vector<32xf32> + // DEFAULT: %[[RES:.*]] = vector.shape_cast %[[SHUFFLE]] : vector<64xf32> to vector<2x8x4xf32> + // DEFAULT: return %[[RES]] : vector<2x8x4xf32> + + // BW-128: %[[ARG_SRC:.*]] = vector.shape_cast %[[SRC]] : vector<8x4xf32> to vector<32xf32> + // BW-128: %[[ARG_DEST:.*]] = vector.shape_cast %[[DEST]] : vector<2x8x4xf32> to vector<64xf32> + // BW-128: %[[SHUFFLE:.*]] = vector.shuffle %[[ARG_DEST]], %[[ARG_SRC]] + // BW-128-SAME: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, + // BW-128-SAME: 88, 89, 90, 91, 92, 93, 94, 95, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + // BW-128-SAME: 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] : vector<64xf32>, vector<32xf32> + // BW-128: %[[RES:.*]] = vector.shape_cast %[[SHUFFLE]] : vector<64xf32> to vector<2x8x4xf32> + // BW-128: return %[[RES]] : vector<2x8x4xf32> + + // BW-0: %[[RES:.*]] = vector.insert %[[SRC]], %[[DEST]] [0] : vector<8x4xf32> into vector<2x8x4xf32> + // BW-0: return %[[RES]] : vector<2x8x4xf32> + + %0 = vector.insert %arg1, %arg0[0]: vector<8x4xf32> into vector<2x8x4xf32> + return %0 : vector<2x8x4xf32> +} From bdd4e8b1c011a6cf30171d365b58327a4e321ba0 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 28 May 2024 15:03:40 +0200 Subject: [PATCH 05/89] [bazel] Port 17ecd23f6932c87fcc8b2b8675762d50f3d53056 --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index d1a2c6f11d98a7..a67f20533ae220 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2232,7 +2232,7 @@ llvm_target_lib_list = [lib for lib in [ ("-gen-callingconv", "lib/Target/X86/X86GenCallingConv.inc"), ("-gen-subtarget", "lib/Target/X86/X86GenSubtargetInfo.inc"), 
("-gen-x86-fold-tables -asmwriternum=1", "lib/Target/X86/X86GenFoldTables.inc"), - ("-gen-x86-compress-evex-tables", "lib/Target/X86/X86GenCompressEVEXTables.inc"), + ("-gen-x86-instr-mapping", "lib/Target/X86/X86GenInstrMapping.inc"), ("-gen-exegesis", "lib/Target/X86/X86GenExegesis.inc"), ("-gen-x86-mnemonic-tables -asmwriternum=1", "lib/Target/X86/X86GenMnemonicTables.inc"), ], From 5988c798de617cb35491c42de388b98b4c175421 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Tue, 28 May 2024 21:08:17 +0800 Subject: [PATCH 06/89] [X86][tablgen] Add assertions when emitting NF transform table --- llvm/utils/TableGen/X86InstrMappingEmitter.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp index a8970d8bcbacdc..d89a1f078328b5 100644 --- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp +++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp @@ -277,8 +277,22 @@ void X86InstrMappingEmitter::emitNFTransformTable( if (Pos == std::string::npos) continue; - if (auto *NewRec = Records.getDef(Name.erase(Pos, 3))) + if (auto *NewRec = Records.getDef(Name.erase(Pos, 3))) { +#ifndef NDEBUG + auto ClobberEFLAGS = [](const Record *R) { + return llvm::any_of( + R->getValueAsListOfDefs("Defs"), + [](const Record *Def) { return Def->getName() == "EFLAGS"; }); + }; + if (ClobberEFLAGS(Rec)) + report_fatal_error("EFLAGS should not be clobbered by " + + Rec->getName()); + if (!ClobberEFLAGS(NewRec)) + report_fatal_error("EFLAGS should be clobbered by " + + NewRec->getName()); +#endif Table.push_back(std::pair(&Target.getInstruction(NewRec), Inst)); + } } printTable(Table, "X86NFTransformTable", "GET_X86_NF_TRANSFORM_TABLE", OS); } From 2c7c9df6ba3e86d7286476e875e215b64059c590 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 28 May 2024 09:15:00 -0400 Subject: [PATCH 07/89] [gn] port 17ecd23f6932 (-gen-x86-instr-mapping) --- 
llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn | 6 +++--- llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn index 78a9d20812ef9b..8264f6d73e791e 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn @@ -12,9 +12,9 @@ tablegen("X86GenDAGISel") { td_file = "X86.td" } -tablegen("X86GenCompressEVEXTables") { +tablegen("X86GenInstrMapping") { visibility = [ ":LLVMX86CodeGen" ] - args = [ "-gen-x86-compress-evex-tables" ] + args = [ "-gen-x86-instr-mapping" ] td_file = "X86.td" } @@ -48,11 +48,11 @@ tablegen("X86GenRegisterBank") { static_library("LLVMX86CodeGen") { deps = [ ":X86GenCallingConv", - ":X86GenCompressEVEXTables", ":X86GenDAGISel", ":X86GenFastISel", ":X86GenFoldTables", ":X86GenGlobalISel", + ":X86GenInstrMapping", ":X86GenRegisterBank", "MCTargetDesc", "TargetInfo", diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn index f3ae5b5899ac6a..2e11d25767cd00 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn @@ -64,7 +64,7 @@ executable("llvm-tblgen") { "SearchableTableEmitter.cpp", "SubtargetEmitter.cpp", "WebAssemblyDisassemblerEmitter.cpp", - "X86CompressEVEXTablesEmitter.cpp", + "X86InstrMappingEmitter.cpp", "X86DisassemblerTables.cpp", "X86FoldTablesEmitter.cpp", "X86MnemonicTables.cpp", From de327865c0e255bc799458ce34bc913f598b4261 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Tue, 28 May 2024 14:16:09 +0100 Subject: [PATCH 08/89] [flang][HLFIR][NFC] Reduce HLFIR to FIR conversion boilerplate (#93539) The pass constructor can be generated automatically. 
This pass is module-level and then runs on all relevant intrinsic operations inside of the module, no matter what top level operation they are inside of. --- flang/include/flang/Optimizer/HLFIR/Passes.h | 4 ---- flang/include/flang/Optimizer/HLFIR/Passes.td | 1 - flang/include/flang/Tools/CLOptions.inc | 2 +- flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 4 ---- 4 files changed, 1 insertion(+), 10 deletions(-) diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.h b/flang/include/flang/Optimizer/HLFIR/Passes.h index edefe36de00c16..83388d0527e192 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.h +++ b/flang/include/flang/Optimizer/HLFIR/Passes.h @@ -20,10 +20,6 @@ namespace hlfir { #define GEN_PASS_DECL -#include "flang/Optimizer/HLFIR/Passes.h.inc" - -std::unique_ptr createConvertHLFIRtoFIRPass(); - #define GEN_PASS_REGISTRATION #include "flang/Optimizer/HLFIR/Passes.h.inc" } // namespace hlfir diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td index 1dd2e3dc81911f..ed49f5093c9652 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.td +++ b/flang/include/flang/Optimizer/HLFIR/Passes.td @@ -12,7 +12,6 @@ include "mlir/Pass/PassBase.td" def ConvertHLFIRtoFIR : Pass<"convert-hlfir-to-fir", "::mlir::ModuleOp"> { let summary = "Lower High-Level FIR to FIR"; - let constructor = "hlfir::createConvertHLFIRtoFIRPass()"; let dependentDialects = [ "mlir::func::FuncDialect", ]; diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index bb3c90ebc04d44..61ea7a7f9bbdd2 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -331,7 +331,7 @@ inline void createHLFIRToFIRPassPipeline( pm.addPass(hlfir::createLowerHLFIROrderedAssignments()); pm.addPass(hlfir::createLowerHLFIRIntrinsics()); pm.addPass(hlfir::createBufferizeHLFIR()); - pm.addPass(hlfir::createConvertHLFIRtoFIRPass()); + 
pm.addPass(hlfir::createConvertHLFIRtoFIR()); } /// Create a pass pipeline for handling certain OpenMP transformations needed diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index e56595d1c8e232..b8823bfa59f8f2 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -789,7 +789,3 @@ class ConvertHLFIRtoFIR }; } // namespace - -std::unique_ptr hlfir::createConvertHLFIRtoFIRPass() { - return std::make_unique(); -} From 44861c7ac563f9e994305e22f2dca1c4f37265e4 Mon Sep 17 00:00:00 2001 From: Sayan Saha Date: Tue, 28 May 2024 09:21:46 -0400 Subject: [PATCH 09/89] [mlir] [linalg] Check for dim shape to decide unit dim for each operand in dropUnitDims pass. (#93317) `mlir-opt --linalg-fold-unit-extent-dims` pass on the following IR ``` #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> module { func.func @main(%arg0: tensor<1x?x?x1xf32>, %arg1: index) -> tensor { %cst = arith.constant dense<1.000000e+00> : tensor<1x1x1x1xf32> %0 = tensor.empty(%arg1) : tensor %1 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %cst : tensor<1x?x?x1xf32>, tensor<1x1x1x1xf32>) outs(%0 : tensor) { ^bb0(%in: f32, %in_0: f32, %out: f32): %2 = arith.mulf %in, %in_0 : f32 %3 = arith.addf %out, %2 : f32 linalg.yield %3 : f32 } -> tensor return %1 : tensor } } ``` produces an incorrect tensor.expand_shape operation: ``` error: 'tensor.expand_shape' op expected dimension 0 of collapsed type to be dynamic since one or more of the corresponding dimensions in the expanded type is dynamic %1 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = 
["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %cst : tensor<1x?x?x1xf32>, tensor<1x1x1x1xf32>) outs(%0 : tensor) { ^ /mathworks/devel/sandbox/sayans/geckWorks/g3294570/repro.mlir:8:10: note: see current operation: %5 = "tensor.expand_shape"(%4) <{reassociation = [[0, 1, 2, 3]]}> : (tensor<61xf32>) -> tensor // -----// IR Dump After LinalgFoldUnitExtentDimsPass Failed (linalg-fold-unit-extent-dims) //----- // #map = affine_map<(d0) -> (0, d0)> #map1 = affine_map<(d0) -> ()> #map2 = affine_map<(d0) -> (d0)> "builtin.module"() ({ "func.func"() <{function_type = (tensor<1x?x?x1xf32>, index) -> tensor, sym_name = "main"}> ({ ^bb0(%arg0: tensor<1x?x?x1xf32>, %arg1: index): %0 = "arith.constant"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor %1 = "tensor.collapse_shape"(%arg0) <{reassociation = [[0, 1], [2, 3]]}> : (tensor<1x?x?x1xf32>) -> tensor %2 = "tensor.empty"() : () -> tensor<61xf32> %3 = "tensor.empty"() : () -> tensor<61xf32> %4 = "linalg.generic"(%1, %0, %2, %3) <{indexing_maps = [#map, #map1, #map2, #map2], iterator_types = [#linalg.iterator_type], operandSegmentSizes = array}> ({ ^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32): %6 = "arith.mulf"(%arg2, %arg3) <{fastmath = #arith.fastmath}> : (f32, f32) -> f32 %7 = "arith.addf"(%arg4, %6) <{fastmath = #arith.fastmath}> : (f32, f32) -> f32 "linalg.yield"(%7) : (f32) -> () }) : (tensor, tensor, tensor<61xf32>, tensor<61xf32>) -> tensor<61xf32> %5 = "tensor.expand_shape"(%4) <{reassociation = [[0, 1, 2, 3]]}> : (tensor<61xf32>) -> tensor "func.return"(%5) : (tensor) -> () }) : () -> () }) : () -> () ``` The reason of this is because the dimension `d0` is determined to be an unit-dim that can be dropped based on the dimensions of operand `arg0` to `linalg.generic`. Later on when iterating over operand `outs` the dimension `d0` is determined to be an unit-dim even though the shape corresponding to it is `Shape::kDynamic`. 
For the `linalg.generic` to be valid `d0` of `outs` does need to be `1` but that isn't properly processed in the current implementation and the dimension is dropped resulting in `outs` operand to be `tensor<61xf32>` in the example. The fix is to also check that the dimension shape is actually `1` before dropping the dimension. The IR after the fix is: ``` #map = affine_map<()[s0, s1] -> (s0 * s1)> #map1 = affine_map<(d0) -> (0, d0)> #map2 = affine_map<(d0) -> ()> module { func.func @main(%arg0: tensor<1x?x?x1xf32>, %arg1: index) -> tensor { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %cst = arith.constant dense<1.000000e+00> : tensor %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor %0 = tensor.empty(%arg1) : tensor %1 = affine.apply #map()[%arg1, %c1] %2 = tensor.empty(%1) : tensor %3 = linalg.generic {indexing_maps = [#map1, #map2, #map1, #map1], iterator_types = ["parallel"]} ins(%collapsed, %cst, %0 : tensor, tensor, tensor) outs(%2 : tensor) { ^bb0(%in: f32, %in_0: f32, %in_1: f32, %out: f32): %4 = arith.mulf %in, %in_0 : f32 %5 = arith.addf %in_1, %4 : f32 linalg.yield %5 : f32 } -> tensor %expanded = tensor.expand_shape %3 [[0, 1], [2, 3]] output_shape [%c0, 1, 61, 1] : tensor into tensor return %expanded : tensor } } ``` --- .../Linalg/Transforms/DropUnitDims.cpp | 3 +- .../Dialect/Linalg/drop-unit-extent-dims.mlir | 43 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp index 65efa18af18f65..c0829397f1f851 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -351,7 +351,8 @@ static UnitExtentReplacementInfo dropUnitExtentFromOperandMetadata( auto isUnitDim = [&](unsigned dim) { if (auto dimExpr = dyn_cast(exprs[dim])) { unsigned oldPosition = dimExpr.getPosition(); - return 
!oldDimsToNewDimsMap.count(oldPosition); + return !oldDimsToNewDimsMap.count(oldPosition) && + (operandShape[dim] == 1); } // Handle the other case where the shape is 1, and is accessed using a // constant 0. diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir index a9cbaaf7fdc485..8f9b12880adcf7 100644 --- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir +++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir @@ -1087,3 +1087,46 @@ func.func @drop_known_unit_constant_low_high(%arg0: tensor<1x383x128xf32>) -> te // CHECK: } : tensor<383x128xf32> to tensor<384x128xf32> // CHECK: tensor.expand_shape %[[PADDED]] // CHECK-SAME: {{\[}}[0, 1], [2]] output_shape [1, 384, 128] : tensor<384x128xf32> into tensor<1x384x128xf32> + +// ----- + +// CHECK: #[[$MAP0:.+]] = affine_map<()[s0, s1] -> (s0 * s1)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (0, d0)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> ()> + +// CHECK-LABEL: func @drop_unit_dim_corresponding_to_dynamic_dim +// CHECK-SAME: %[[ARG0:.*]]: tensor<1x?x?x1xf32>, +// CHECK-SAME: %[[ARG1:.*]]: index) -> tensor { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant dense<1.000000e+00> : tensor +// CHECK: %[[VAL_3:.*]] = tensor.collapse_shape %[[ARG0]] {{\[\[}}0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor +// CHECK: %[[VAL_4:.*]] = tensor.empty(%[[ARG1]]) : tensor +// CHECK: %[[VAL_5:.*]] = affine.apply #[[$MAP0]](){{\[}}%[[ARG1]], %[[VAL_1]]] +// CHECK: %[[VAL_6:.*]] = tensor.empty(%[[VAL_5]]) : tensor +// CHECK: %[[VAL_7:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel"]} ins(%[[VAL_3]], %[[VAL_2]], %[[VAL_4]] : tensor, tensor, tensor) outs(%[[VAL_6]] : tensor) { +// CHECK: ^bb0(%[[VAL_8:.*]]: f32, %[[VAL_9:.*]]: f32, %[[VAL_10:.*]]: f32, %[[VAL_11:.*]]: f32): +// CHECK: 
%[[VAL_12:.*]] = arith.mulf %[[VAL_8]], %[[VAL_9]] : f32 +// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_10]], %[[VAL_12]] : f32 +// CHECK: linalg.yield %[[VAL_13]] : f32 +// CHECK: } -> tensor +// CHECK: %[[VAL_14:.*]] = tensor.expand_shape %[[VAL_7]] {{\[\[}}0, 1], [2, 3]] output_shape {{\[}}%[[VAL_0]], 1, 61, 1] : tensor into tensor +// CHECK: return %[[VAL_14]] : tensor +// CHECK: } + +#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +module { + func.func @drop_unit_dim_corresponding_to_dynamic_dim(%arg0: tensor<1x?x?x1xf32>, %arg1: index) -> tensor { + %cst = arith.constant dense<1.000000e+00> : tensor<1x1x1x1xf32> + %0 = tensor.empty(%arg1) : tensor + %1 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %cst : tensor<1x?x?x1xf32>, tensor<1x1x1x1xf32>) outs(%0 : tensor) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %2 = arith.mulf %in, %in_0 : f32 + %3 = arith.addf %out, %2 : f32 + linalg.yield %3 : f32 + } -> tensor + return %1 : tensor + } +} From 24a12a9c85b1ec08ff597f43e3414271d8439a97 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 28 May 2024 09:22:55 -0400 Subject: [PATCH 10/89] [clang] Diagnose problematic diagnostic messages (#93229) Clang has some unwritten rules about diagnostic wording regarding things like punctuation and capitalization. This patch documents those rules and adds some tablegen support for checking diagnostics follow the rules. Specifically: tablegen now checks that a diagnostic does not start with a capital letter or end with punctuation, except for the usual exceptions like proper nouns or ending with a question. 
Now that the code base is clean of such issues, the diagnostics are emitted as an error rather than a warning to ensure that failure to follow these rules is either addressed by an author, or a new exception is added to the checking logic. --- clang/docs/InternalsManual.rst | 38 ++++ clang/test/TableGen/deferred-diag.td | 10 +- clang/test/TableGen/text-substitution.td | 4 +- clang/test/TableGen/wording-errors.td | 55 +++++ .../TableGen/ClangDiagnosticsEmitter.cpp | 194 ++++++++++++++++++ 5 files changed, 294 insertions(+), 7 deletions(-) create mode 100644 clang/test/TableGen/wording-errors.td diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index b3e2b870ae5f9a..3d21e37784b363 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -123,6 +123,44 @@ severe that error recovery won't be able to recover sensibly from them (thus spewing a ton of bogus errors). One example of this class of error are failure to ``#include`` a file. +Diagnostic Wording +^^^^^^^^^^^^^^^^^^ +The wording used for a diagnostic is critical because it is the only way for a +user to know how to correct their code. Use the following suggestions when +wording a diagnostic. + +* Diagnostics in Clang do not start with a capital letter and do not end with + punctuation. + + * This does not apply to proper nouns like ``Clang`` or ``OpenMP``, to + acronyms like ``GCC`` or ``ARC``, or to language standards like ``C23`` + or ``C++17``. + * A trailing question mark is allowed. e.g., ``unknown identifier %0; did + you mean %1?``. + +* Appropriately capitalize proper nouns like ``Clang``, ``OpenCL``, ``GCC``, + ``Objective-C``, etc and language standard versions like ``C11`` or ``C++11``. +* The wording should be succinct. If necessary, use a semicolon to combine + sentence fragments instead of using complete sentences. 
e.g., prefer wording + like ``'%0' is deprecated; it will be removed in a future release of Clang`` + over wording like ``'%0' is deprecated. It will be removed in a future release + of Clang``. +* The wording should be actionable and avoid using standards terms or grammar + productions that a new user would not be familiar with. e.g., prefer wording + like ``missing semicolon`` over wording like ``syntax error`` (which is not + actionable) or ``expected unqualified-id`` (which uses standards terminology). +* The wording should clearly explain what is wrong with the code rather than + restating what the code does. e.g., prefer wording like ``type %0 requires a + value in the range %1 to %2`` over wording like ``%0 is invalid``. +* The wording should have enough contextual information to help the user + identify the issue in a complex expression. e.g., prefer wording like + ``both sides of the %0 binary operator are identical`` over wording like + ``identical operands to binary operator``. +* Use single quotes to denote syntactic constructs or command line arguments + named in a diagnostic message. e.g., prefer wording like ``'this' pointer + cannot be null in well-defined C++ code`` over wording like ``this pointer + cannot be null in well-defined C++ code``. + The Format String ^^^^^^^^^^^^^^^^^ diff --git a/clang/test/TableGen/deferred-diag.td b/clang/test/TableGen/deferred-diag.td index c1906d4a9e45ec..d7e8e694c7b3e4 100644 --- a/clang/test/TableGen/deferred-diag.td +++ b/clang/test/TableGen/deferred-diag.td @@ -4,24 +4,24 @@ include "DiagnosticBase.inc" // Test usage of Deferrable and NonDeferrable in diagnostics. 
-def test_default : Error<"This error is non-deferrable by default">; +def test_default : Error<"this error is non-deferrable by default">; // CHECK-DAG: DIAG(test_default, {{.*}}SFINAE_SubstitutionFailure, false, true, true, false, 0) -def test_deferrable : Error<"This error is deferrable">, Deferrable; +def test_deferrable : Error<"this error is deferrable">, Deferrable; // CHECK-DAG: DIAG(test_deferrable, {{.*}} SFINAE_SubstitutionFailure, false, true, true, true, 0) -def test_non_deferrable : Error<"This error is non-deferrable">, NonDeferrable; +def test_non_deferrable : Error<"this error is non-deferrable">, NonDeferrable; // CHECK-DAG: DIAG(test_non_deferrable, {{.*}} SFINAE_SubstitutionFailure, false, true, true, false, 0) let Deferrable = 1 in { -def test_let : Error<"This error is deferrable by let">; +def test_let : Error<"this error is deferrable by let">; // CHECK-DAG: DIAG(test_let, {{.*}} SFINAE_SubstitutionFailure, false, true, true, true, 0) // Make sure TextSubstitution is allowed in the let Deferrable block. def textsub : TextSubstitution<"%select{text1|text2}0">; -def test_let2 : Error<"This error is deferrable by let %sub{textsub}0">; +def test_let2 : Error<"this error is deferrable by let %sub{textsub}0">; // CHECK-DAG: DIAG(test_let2, {{.*}} SFINAE_SubstitutionFailure, false, true, true, true, 0) } diff --git a/clang/test/TableGen/text-substitution.td b/clang/test/TableGen/text-substitution.td index aafdbe48c43bec..b0d030aca65134 100644 --- a/clang/test/TableGen/text-substitution.td +++ b/clang/test/TableGen/text-substitution.td @@ -26,8 +26,8 @@ def sub_test_rewrite : TextSubstitution< // CHECK-SAME: Q! %q1. // CHECK-SAME: PLACEHOLDER! %0.OBJCCLASS! // CHECK-SAME: %objcclass5. OBJCINSTANCE! -// CHECK-SAME: %objcinstance4. DONE!", -def test_rewrite: Error<"%sub{sub_test_rewrite}5,4,3,2,1,0 DONE!">; +// CHECK-SAME: %objcinstance4. 
DONE", +def test_rewrite: Error<"%sub{sub_test_rewrite}5,4,3,2,1,0 DONE">; def test_sub_basic : Error<"%sub{yes_no}0">; // CHECK: test_sub_basic diff --git a/clang/test/TableGen/wording-errors.td b/clang/test/TableGen/wording-errors.td new file mode 100644 index 00000000000000..eb5eb2f547c782 --- /dev/null +++ b/clang/test/TableGen/wording-errors.td @@ -0,0 +1,55 @@ +// RUN: not clang-tblgen -gen-clang-diags-defs -I%S %s -o /dev/null 2>&1 | FileCheck %s +include "DiagnosticBase.inc" + +// Ensure we catch a capital letter at the start of a diagnostic. +def zero : Error< + "This is bad">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'This' is invalid + +// Test that we also correctly handle selections. +def one : Error< + "%select{|or}0 That">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'That' is invalid +def two : Error< + "%select{as does|}0 This">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'This' is invalid +def three : Error< + "%select{and||of course}0 Whatever">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'Whatever' is invalid + +// Test that we accept the following cases. +def four : Error< + "this is fine">; +def five : Error< + "%select{this|is|also}0 Fine">; +def six : Error< + "%select{this|is|also|}0 fine">; +def seven : Error< + "%select{ARC|C|C23|C++14|OpenMP}0 are also fine">; + +// Next, test that we catch punctuation at the end of the diagnostic. +def eight : Error< + "punctuation is bad.">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def nine : Error< + "it's really bad!">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '!' 
is invalid +def ten : Error< + "we also catch %select{punctuation.|in select}0">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def eleven : Error< + "and %select{|here.}0">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def twelve : Error< + "and %select{here.|}0">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def thirteen : Error< + "and even %select{|here.|}0">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def fourteen : Error< + "and %select{here}0.">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid + +// Test that we accept the following cases. +def fifteen : Error< + "question marks are intentionally okay?">; diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index f564689fff7cf1..b290530444d2ab 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -1213,6 +1213,197 @@ static bool isRemark(const Record &Diag) { return ClsName == "CLASS_REMARK"; } +// Presumes the text has been split at the first whitespace or hyphen. +static bool isExemptAtStart(StringRef Text) { + // Fast path, the first character is lowercase or not alphanumeric. + if (Text.empty() || isLower(Text[0]) || !isAlnum(Text[0])) + return true; + + // If the text is all uppercase (or numbers, +, or _), then we assume it's an + // acronym and that's allowed. This covers cases like ISO, C23, C++14, and + // OBJECT_MODE. However, if there's only a single letter other than "C", we + // do not exempt it so that we catch a case like "A really bad idea" while + // still allowing a case like "C does not allow...". 
+ if (llvm::all_of(Text, [](char C) { + return isUpper(C) || isDigit(C) || C == '+' || C == '_'; + })) + return Text.size() > 1 || Text[0] == 'C'; + + // Otherwise, there are a few other exemptions. + return StringSwitch(Text) + .Case("AddressSanitizer", true) + .Case("CFString", true) + .Case("Clang", true) + .Case("Fuchsia", true) + .Case("GNUstep", true) + .Case("IBOutletCollection", true) + .Case("Microsoft", true) + .Case("Neon", true) + .StartsWith("NSInvocation", true) // NSInvocation, NSInvocation's + .Case("Objective", true) // Objective-C (hyphen is a word boundary) + .Case("OpenACC", true) + .Case("OpenCL", true) + .Case("OpenMP", true) + .Case("Pascal", true) + .Case("Swift", true) + .Case("Unicode", true) + .Case("Vulkan", true) + .Case("WebAssembly", true) + .Default(false); +} + +// Does not presume the text has been split at all. +static bool isExemptAtEnd(StringRef Text) { + // Rather than come up with a list of characters that are allowed, we go the + // other way and look only for characters that are not allowed. + switch (Text.back()) { + default: + return true; + case '?': + // Explicitly allowed to support "; did you mean?". + return true; + case '.': + case '!': + return false; + } +} + +static void verifyDiagnosticWording(const Record &Diag) { + StringRef FullDiagText = Diag.getValueAsString("Summary"); + + auto DiagnoseStart = [&](StringRef Text) { + // Verify that the text does not start with a capital letter, except for + // special cases that are exempt like ISO and C++. Find the first word + // by looking for a word breaking character. 
+ char Separators[] = {' ', '-', ',', '}'}; + auto Iter = std::find_first_of( + Text.begin(), Text.end(), std::begin(Separators), std::end(Separators)); + + StringRef First = Text.substr(0, Iter - Text.begin()); + if (!isExemptAtStart(First)) { + PrintError(&Diag, + "Diagnostics should not start with a capital letter; '" + + First + "' is invalid"); + } + }; + + auto DiagnoseEnd = [&](StringRef Text) { + // Verify that the text does not end with punctuation like '.' or '!'. + if (!isExemptAtEnd(Text)) { + PrintError(&Diag, "Diagnostics should not end with punctuation; '" + + Text.substr(Text.size() - 1, 1) + "' is invalid"); + } + }; + + // If the diagnostic starts with %select, look through it to see whether any + // of the options will cause a problem. + if (FullDiagText.starts_with("%select{")) { + // Do a balanced delimiter scan from the start of the text to find the + // closing '}', skipping intermediary {} pairs. + + size_t BraceCount = 1; + constexpr size_t PercentSelectBraceLen = sizeof("%select{") - 1; + auto Iter = FullDiagText.begin() + PercentSelectBraceLen; + for (auto End = FullDiagText.end(); Iter != End; ++Iter) { + char Ch = *Iter; + if (Ch == '{') + ++BraceCount; + else if (Ch == '}') + --BraceCount; + if (!BraceCount) + break; + } + // Defending against a malformed diagnostic string. + if (BraceCount != 0) + return; + + StringRef SelectText = + FullDiagText.substr(PercentSelectBraceLen, Iter - FullDiagText.begin() - + PercentSelectBraceLen); + SmallVector SelectPieces; + SelectText.split(SelectPieces, '|'); + + // Walk over all of the individual pieces of select text to see if any of + // them start with an invalid character. If any of the select pieces is + // empty, we need to look at the first word after the %select to see + // whether that is invalid or not. If all of the pieces are fine, then we + // don't need to check anything else about the start of the diagnostic. 
+ bool CheckSecondWord = false; + for (StringRef Piece : SelectPieces) { + if (Piece.empty()) + CheckSecondWord = true; + else + DiagnoseStart(Piece); + } + + if (CheckSecondWord) { + // There was an empty select piece, so we need to check the second + // word. This catches situations like '%select{|fine}0 Not okay'. Add + // two to account for the closing curly brace and the number after it. + StringRef AfterSelect = + FullDiagText.substr(Iter - FullDiagText.begin() + 2).ltrim(); + DiagnoseStart(AfterSelect); + } + } else { + // If the start of the diagnostic is not %select, we can check the first + // word and be done with it. + DiagnoseStart(FullDiagText); + } + + // If the last character in the diagnostic is a number preceded by a }, scan + // backwards to see if this is for a %select{...}0. If it is, we need to look + // at each piece to see whether it ends in punctuation or not. + bool StillNeedToDiagEnd = true; + if (isDigit(FullDiagText.back()) && *(FullDiagText.end() - 2) == '}') { + // Scan backwards to find the opening curly brace. + size_t BraceCount = 1; + auto Iter = FullDiagText.end() - sizeof("}0"); + for (auto End = FullDiagText.begin(); Iter != End; --Iter) { + char Ch = *Iter; + if (Ch == '}') + ++BraceCount; + else if (Ch == '{') + --BraceCount; + if (!BraceCount) + break; + } + // Defending against a malformed diagnostic string. + if (BraceCount != 0) + return; + + // Continue the backwards scan to find the word before the '{' to see if it + // is 'select'. + constexpr size_t SelectLen = sizeof("select") - 1; + bool IsSelect = + (FullDiagText.substr(Iter - SelectLen - FullDiagText.begin(), + SelectLen) == "select"); + if (IsSelect) { + // Gather the content between the {} for the select in question so we can + // split it into pieces. + StillNeedToDiagEnd = false; // No longer need to handle the end. 
+ StringRef SelectText = + FullDiagText.substr(Iter - FullDiagText.begin() + /*{*/ 1, + FullDiagText.end() - Iter - /*pos before }0*/ 3); + SmallVector SelectPieces; + SelectText.split(SelectPieces, '|'); + for (StringRef Piece : SelectPieces) { + // Not worrying about a situation like: "this is bar. %select{foo|}0". + if (!Piece.empty()) + DiagnoseEnd(Piece); + } + } + } + + // If we didn't already cover the diagnostic because of a %select, handle it + // now. + if (StillNeedToDiagEnd) + DiagnoseEnd(FullDiagText); + + // FIXME: This could also be improved by looking for instances of clang or + // gcc in the diagnostic and recommend Clang or GCC instead. However, this + // runs into odd situations like [[clang::warn_unused_result]], + // #pragma clang, or --unwindlib=libgcc. +} /// ClangDiagsDefsEmitter - The top-level class emits .def files containing /// declarations of Clang diagnostics. @@ -1273,6 +1464,9 @@ void clang::EmitClangDiagsDefs(RecordKeeper &Records, raw_ostream &OS, if (!Component.empty() && Component != R.getValueAsString("Component")) continue; + // Validate diagnostic wording for common issues. 
+ verifyDiagnosticWording(R); + OS << "DIAG(" << R.getName() << ", "; OS << R.getValueAsDef("Class")->getName(); OS << ", (unsigned)diag::Severity::" From 6e1a04247d6cc3295be8e3b14286f95983632e1c Mon Sep 17 00:00:00 2001 From: Tyker Date: Tue, 28 May 2024 15:21:56 +0200 Subject: [PATCH 11/89] Fix failure after d46e37348ec3f8054b10bcbbe7c11149d7f61031 --- llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir | 8 ++++---- llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir | 11 +++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir index d8f2b08adaf2fb..dc20a1577aa5bc 100644 --- a/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir +++ b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir @@ -3,16 +3,16 @@ # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ # RUN: -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=0,ppc-xtoi-peephole-count=8 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=0-7 \ # RUN: | FileCheck %s --check-prefix=ALL # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=2 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=3-4 \ # RUN: | FileCheck %s --check-prefix=ONE-FIRSTSTORE # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=5,ppc-xtoi-peephole-count=2 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=5-6 \ # RUN: | FileCheck %s --check-prefix=ONE-SECONDSTORE # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu 
-verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=4 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=3-6 \ # RUN: | FileCheck %s --check-prefix=TWO --- diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir index cf3ff291e26c6a..09f7ededa20c64 100644 --- a/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir +++ b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir @@ -3,16 +3,19 @@ # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ # RUN: -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=0,ppc-per-op-peephole-count=6 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=0-5 \ # RUN: | FileCheck %s --check-prefix=ALL # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=1 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=0-5 \ +# RUN: | FileCheck %s --check-prefix=ALL +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=3 \ # RUN: | FileCheck %s --check-prefix=ONE-FIRST-RLWINM # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=4,ppc-per-op-peephole-count=1 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=4 \ # RUN: | FileCheck %s --check-prefix=ONE-SECOND-RLWINM # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - 
-debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=2 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=3-4 \ # RUN: | FileCheck %s --check-prefix=TWO --- From 8995ccc4460ed8a90dcc9bd023743a8f59458f50 Mon Sep 17 00:00:00 2001 From: Xu Zhang Date: Tue, 28 May 2024 21:29:31 +0800 Subject: [PATCH 12/89] [Clang] Add support for [[msvc::noinline]] attribute. (#91720) Fixes #90941. Add support for ``[[msvc::noinline]]`` attribute, which is actually an alias of ``[[clang::noinline]]``. --- clang/include/clang/Basic/Attr.td | 7 ++-- clang/lib/Sema/SemaStmtAttr.cpp | 2 +- clang/test/CodeGen/attr-noinline.cpp | 32 ++++++++++++++++++ clang/test/Sema/attr-noinline.cpp | 50 ++++++++++++++++++++++++++-- 4 files changed, 86 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index e59cccccdd3690..ef9df1e9d8b4aa 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2025,9 +2025,12 @@ def Convergent : InheritableAttr { def NoInline : DeclOrStmtAttr { let Spellings = [CustomKeyword<"__noinline__">, GCC<"noinline">, CXX11<"clang", "noinline">, C23<"clang", "noinline">, + CXX11<"msvc", "noinline">, C23<"msvc", "noinline">, Declspec<"noinline">]; - let Accessors = [Accessor<"isClangNoInline", [CXX11<"clang", "noinline">, - C23<"clang", "noinline">]>]; + let Accessors = [Accessor<"isStmtNoInline", [CXX11<"clang", "noinline">, + C23<"clang", "noinline">, + CXX11<"msvc", "noinline">, + C23<"msvc", "noinline">]>]; let Documentation = [NoInlineDocs]; let Subjects = SubjectList<[Function, Stmt], WarnDiag, "functions and statements">; diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 8735d96c840793..82373fe96a8243 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -285,7 +285,7 @@ bool Sema::CheckAlwaysInlineAttr(const Stmt *OrigSt, const Stmt *CurSt, static Attr 
*handleNoInlineAttr(Sema &S, Stmt *St, const ParsedAttr &A, SourceRange Range) { NoInlineAttr NIA(S.Context, A); - if (!NIA.isClangNoInline()) { + if (!NIA.isStmtNoInline()) { S.Diag(St->getBeginLoc(), diag::warn_function_attribute_ignored_in_stmt) << "[[clang::noinline]]"; return nullptr; diff --git a/clang/test/CodeGen/attr-noinline.cpp b/clang/test/CodeGen/attr-noinline.cpp index f0588cfecf4631..c1fb9941b5251d 100644 --- a/clang/test/CodeGen/attr-noinline.cpp +++ b/clang/test/CodeGen/attr-noinline.cpp @@ -9,6 +9,7 @@ static int baz(int x) { } [[clang::noinline]] bool noi() { } +[[msvc::noinline]] bool ms_noi() { return true; } void foo(int i) { [[clang::noinline]] bar(); @@ -39,6 +40,31 @@ void foo(int i) { // CHECK: call noundef zeroext i1 @_Z3barv() } +void ms_noi_check(int i) { + [[msvc::noinline]] bar(); +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR:[0-9]+]] + [[msvc::noinline]] i = baz(i); +// CHECK: call noundef i32 @_ZL3bazi({{.*}}) #[[NOINLINEATTR]] + [[msvc::noinline]] (i = 4, bar()); +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] + [[msvc::noinline]] (void)(bar()); +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] + [[msvc::noinline]] f(bar(), bar()); +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] +// CHECK: call void @_Z1fbb({{.*}}) #[[NOINLINEATTR]] + [[msvc::noinline]] [] { bar(); bar(); }(); // noinline only applies to the anonymous function call +// CHECK: call void @"_ZZ12ms_noi_checkiENK3$_0clEv"(ptr {{[^,]*}} %ref.tmp) #[[NOINLINEATTR]] + [[msvc::noinline]] for (bar(); bar(); bar()) {} +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] + [[msvc::noinline]] ms_noi(); +// CHECK: call noundef zeroext i1 @_Z6ms_noiv() + ms_noi(); +// CHECK: call noundef zeroext i1 @_Z6ms_noiv() +} + 
struct S { friend bool operator==(const S &LHS, const S &RHS); }; @@ -50,6 +76,12 @@ void func(const S &s1, const S &s2) { bool b; [[clang::noinline]] b = s1 == s2; // CHECK: call noundef zeroext i1 @_ZeqRK1SS1_({{.*}}) #[[NOINLINEATTR]] + + [[msvc::noinline]]g(s1 == s2); +// CHECK: call noundef zeroext i1 @_ZeqRK1SS1_({{.*}}) #[[NOINLINEATTR]] +// CHECK: call void @_Z1gb({{.*}}) #[[NOINLINEATTR]] + [[msvc::noinline]] b = s1 == s2; +// CHECK: call noundef zeroext i1 @_ZeqRK1SS1_({{.*}}) #[[NOINLINEATTR]] } // CHECK: attributes #[[NOINLINEATTR]] = { noinline } diff --git a/clang/test/Sema/attr-noinline.cpp b/clang/test/Sema/attr-noinline.cpp index bd6505b9fe98ef..6da0e873af1b6a 100644 --- a/clang/test/Sema/attr-noinline.cpp +++ b/clang/test/Sema/attr-noinline.cpp @@ -2,9 +2,9 @@ int bar(); -// expected-note@+1{{conflicting attribute is here}} +// expected-note@+1 2 {{conflicting attribute is here}} [[gnu::always_inline]] void always_inline_fn(void) { } -// expected-note@+1{{conflicting attribute is here}} +// expected-note@+1 2 {{conflicting attribute is here}} [[gnu::flatten]] void flatten_fn(void) { } [[gnu::noinline]] void noinline_fn(void) { } @@ -25,7 +25,21 @@ void foo() { __attribute__((noinline)) bar(); // expected-warning {{attribute is ignored on this statement as it only applies to functions; use '[[clang::noinline]]' on statements}} } +void ms_noi_check() { + [[msvc::noinline]] bar(); + [[msvc::noinline(0)]] bar(); // expected-error {{'noinline' attribute takes no arguments}} + int x; + [[msvc::noinline]] x = 0; // expected-warning {{'noinline' attribute is ignored because there exists no call expression inside the statement}} + [[msvc::noinline]] { asm("nop"); } // expected-warning {{'noinline' attribute is ignored because there exists no call expression inside the statement}} + [[msvc::noinline]] label: x = 1; // expected-warning {{'noinline' attribute only applies to functions and statements}} + + [[msvc::noinline]] always_inline_fn(); // 
expected-warning {{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + [[msvc::noinline]] flatten_fn(); // expected-warning {{statement attribute 'noinline' has higher precedence than function attribute 'flatten'}} + [[msvc::noinline]] noinline_fn(); +} + [[clang::noinline]] static int i = bar(); // expected-warning {{'noinline' attribute only applies to functions and statements}} +[[msvc::noinline]] static int j = bar(); // expected-warning {{'noinline' attribute only applies to functions and statements}} // This used to crash the compiler. template @@ -69,7 +83,39 @@ int variadic_baz(int x) { [[clang::noinline]] return non_dependent(x) + (dependent(x) + ...); } +template [[clang::always_inline]] +int qux(int x) { // #QUX + // expected-warning@+2{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#NO_DEP{{conflicting attribute is here}} + [[msvc::noinline]] non_dependent(x); + if constexpr (D>0) { + // expected-warning@+6{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#NO_DEP{{conflicting attribute is here}} + // expected-warning@+4 3{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#QUX 3{{conflicting attribute is here}} + // expected-note@#QUX_INST 3{{in instantiation}} + // expected-note@+1 3{{in instantiation}} + [[msvc::noinline]] return non_dependent(x), qux(x + 1); + } + return x; +} + +// We can't suppress if there is a variadic involved. +template +int variadic_qux(int x) { + // Diagnoses NO_DEP 2x, once during phase 1, the second during instantiation. + // Dianoses DEP 3x, once per variadic expansion. 
+ // expected-warning@+5 2{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#NO_DEP 2{{conflicting attribute is here}} + // expected-warning@+3 3{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#DEP 3{{conflicting attribute is here}} + // expected-note@#QUX_VARIADIC_INST{{in instantiation}} + [[msvc::noinline]] return non_dependent(x) + (dependent(x) + ...); +} + void use() { baz<3>(0); // #BAZ_INST variadic_baz<0, 1, 2>(0); // #VARIADIC_INST + qux<3>(0); // #QUX_INST + variadic_qux<0, 1, 2>(0); // #QUX_VARIADIC_INST } From 2ace7bdcfe640c69bd4dcf508d39485e0ef7ea06 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Tue, 28 May 2024 15:38:02 +0200 Subject: [PATCH 13/89] [Clang] allow `` `@$ `` in raw string delimiters in C++26 (#93216) And as an extension in older language modes. Per https://eel.is/c++draft/lex.string#nt:d-char Fixes #93130 --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/CharInfo.h | 15 +++++++------- .../include/clang/Basic/DiagnosticLexKinds.td | 8 ++++++++ clang/lib/Basic/CharInfo.cpp | 20 +++++++++---------- clang/lib/Lex/Lexer.cpp | 11 +++++++++- clang/test/Lexer/cxx2c-raw-strings.cpp | 12 +++++++++++ 6 files changed, 49 insertions(+), 18 deletions(-) create mode 100644 clang/test/Lexer/cxx2c-raw-strings.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 182f8b5824258e..6b746cda53c71b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -802,6 +802,7 @@ Bug Fixes to C++ Support - Fixed a regression introduced in Clang 18 causing a static function overloading a non-static function with the same parameters not to be diagnosed. (Fixes #GH93456). - Clang now diagnoses unexpanded parameter packs in attributes. (Fixes #GH93269). +- Clang now allows ``@$``` in raw string literals. Fixes (#GH93130). 
Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h index d8079553118287..4d90528f7992e3 100644 --- a/clang/include/clang/Basic/CharInfo.h +++ b/clang/include/clang/Basic/CharInfo.h @@ -28,8 +28,7 @@ namespace charinfo { CHAR_LOWER = 0x0040, // a-z CHAR_UNDER = 0x0080, // _ CHAR_PERIOD = 0x0100, // . - CHAR_RAWDEL = 0x0200, // {}[]#<>%:;?*+-/^&|~!=,"' - CHAR_PUNCT = 0x0400 // `$@() + CHAR_PUNCT = 0x0200, // {}[]#<>%:;?*+-/^&|~!=,"'`$@() }; enum { @@ -152,7 +151,8 @@ LLVM_READONLY inline bool isHexDigit(unsigned char c) { /// Note that '_' is both a punctuation character and an identifier character! LLVM_READONLY inline bool isPunctuation(unsigned char c) { using namespace charinfo; - return (InfoTable[c] & (CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL|CHAR_PUNCT)) != 0; + return (InfoTable[c] & + (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT | CHAR_PUNCT)) != 0; } /// Return true if this character is an ASCII printable character; that is, a @@ -160,8 +160,8 @@ LLVM_READONLY inline bool isPunctuation(unsigned char c) { /// terminal. LLVM_READONLY inline bool isPrintable(unsigned char c) { using namespace charinfo; - return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD|CHAR_PUNCT| - CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL|CHAR_SPACE)) != 0; + return (InfoTable[c] & (CHAR_UPPER | CHAR_LOWER | CHAR_PERIOD | CHAR_PUNCT | + CHAR_DIGIT | CHAR_UNDER | CHAR_SPACE)) != 0; } /// Return true if this is the body character of a C preprocessing number, @@ -175,8 +175,9 @@ LLVM_READONLY inline bool isPreprocessingNumberBody(unsigned char c) { /// Return true if this is the body character of a C++ raw string delimiter. 
LLVM_READONLY inline bool isRawStringDelimBody(unsigned char c) { using namespace charinfo; - return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD| - CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL)) != 0; + return (InfoTable[c] & (CHAR_UPPER | CHAR_LOWER | CHAR_PERIOD | CHAR_DIGIT | + CHAR_UNDER | CHAR_PUNCT)) != 0 && + c != '(' && c != ')'; } enum class EscapeChar { diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 5a4551a96ca4e7..25fbfe83fa2bcf 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -111,6 +111,14 @@ def warn_cxx98_compat_raw_string_literal : Warning< "raw string literals are incompatible with C++98">, InGroup, DefaultIgnore; +def warn_cxx26_compat_raw_string_literal_character_set : Warning< + " '%0' in a raw string literal delimiter is incompatible " + "with standards before C++2c">, + InGroup, DefaultIgnore; +def ext_cxx26_raw_string_literal_character_set : Extension< + " '%0' in a raw string literal delimiter is a C++2c extension">, + InGroup, DefaultIgnore; + def warn_multichar_character_literal : Warning< "multi-character character constant">, InGroup; def warn_four_char_character_literal : Warning< diff --git a/clang/lib/Basic/CharInfo.cpp b/clang/lib/Basic/CharInfo.cpp index d02054c9718f5f..26d693b8e9b943 100644 --- a/clang/lib/Basic/CharInfo.cpp +++ b/clang/lib/Basic/CharInfo.cpp @@ -31,20 +31,20 @@ const uint16_t clang::charinfo::InfoTable[256] = { 0 , 0 , 0 , 0 , //32 SP 33 ! 34 " 35 # //36 $ 37 % 38 & 39 ' - CHAR_SPACE , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_SPACE , CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , //40 ( 41 ) 42 * 43 + //44 , 45 - 46 . 
47 / - CHAR_PUNCT , CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PERIOD , CHAR_PUNCT , //48 0 49 1 50 2 51 3 //52 4 53 5 54 6 55 7 CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , //56 8 57 9 58 : 59 ; //60 < 61 = 62 > 63 ? - CHAR_DIGIT , CHAR_DIGIT , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_DIGIT , CHAR_DIGIT , CHAR_PUNCT , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , //64 @ 65 A 66 B 67 C //68 D 69 E 70 F 71 G CHAR_PUNCT , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , @@ -59,8 +59,8 @@ const uint16_t clang::charinfo::InfoTable[256] = { CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , //88 X 89 Y 90 Z 91 [ //92 \ 93 ] 94 ^ 95 _ - CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_RAWDEL , - CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , CHAR_UNDER , //96 ` 97 a 98 b 99 c //100 d 101 e 102 f 103 g CHAR_PUNCT , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , @@ -75,6 +75,6 @@ const uint16_t clang::charinfo::InfoTable[256] = { CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , //120 x 121 y 122 z 123 { //124 | 125 } 126 ~ 127 DEL - CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , 0 }; diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index c98645993abe07..c7543a48c0b50e 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -2261,8 +2261,17 @@ bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, unsigned PrefixLen = 0; - while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) + while (PrefixLen != 16 && 
isRawStringDelimBody(CurPtr[PrefixLen])) { ++PrefixLen; + if (!isLexingRawMode() && + llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) { + const char *Pos = &CurPtr[PrefixLen]; + Diag(Pos, LangOpts.CPlusPlus26 + ? diag::warn_cxx26_compat_raw_string_literal_character_set + : diag::ext_cxx26_raw_string_literal_character_set) + << StringRef(Pos, 1); + } + } // If the last character was not a '(', then we didn't lex a valid delimiter. if (CurPtr[PrefixLen] != '(') { diff --git a/clang/test/Lexer/cxx2c-raw-strings.cpp b/clang/test/Lexer/cxx2c-raw-strings.cpp new file mode 100644 index 00000000000000..569a4b8447e57d --- /dev/null +++ b/clang/test/Lexer/cxx2c-raw-strings.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -Wc++26-extensions %s +// RUN: %clang_cc1 -std=c++2c -fsyntax-only -verify=cxx26 -Wpre-c++26-compat %s + +int main() { + (void) R"abc`@$(foobar)abc`@$"; + //expected-warning@-1 {{'`' in a raw string literal delimiter is a C++2c extension}} + //expected-warning@-2 {{'@' in a raw string literal delimiter is a C++2c extension}} + //expected-warning@-3 {{'$' in a raw string literal delimiter is a C++2c extension}} + //cxx26-warning@-4 {{'`' in a raw string literal delimiter is incompatible with standards before C++2c}} + //cxx26-warning@-5 {{'@' in a raw string literal delimiter is incompatible with standards before C++2c}} + //cxx26-warning@-6 {{'$' in a raw string literal delimiter is incompatible with standards before C++2c}} +} From 57790db07c5a70b557d9e0cc88d8cda417b2f30d Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 28 May 2024 13:45:52 +0000 Subject: [PATCH 14/89] [gn build] Port 23e1ed65c2c3 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index b642b2c82e6d8d..6bd56dd4117b03 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ 
b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -384,6 +384,9 @@ if (current_toolchain == default_toolchain) { "__concepts/totally_ordered.h", "__condition_variable/condition_variable.h", "__config", + "__configuration/abi.h", + "__configuration/compiler.h", + "__configuration/platform.h", "__coroutine/coroutine_handle.h", "__coroutine/coroutine_traits.h", "__coroutine/noop_coroutine_handle.h", From 46a30dfdfd765021a76c927f70f95024d30786f2 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 23 May 2024 14:09:10 +0000 Subject: [PATCH 15/89] Reland "[AArch64] NFC: Add RUN lines for streaming-compatible code." (#91599) This reverts commit aa9d467abaeb440dc70b64c0f35b8d5e731f3a19. --- ...streaming-mode-fixed-length-and-combine.ll | 83 + ...treaming-mode-fixed-length-bit-counting.ll | 457 +++ ...sve-streaming-mode-fixed-length-bitcast.ll | 97 + ...e-streaming-mode-fixed-length-bitselect.ll | 12 + ...treaming-mode-fixed-length-build-vector.ll | 88 + .../sve-streaming-mode-fixed-length-concat.ll | 228 ++ ...e-streaming-mode-fixed-length-ext-loads.ll | 138 + ...ing-mode-fixed-length-extract-subvector.ll | 136 + ...ng-mode-fixed-length-extract-vector-elt.ll | 53 + ...e-streaming-mode-fixed-length-fcopysign.ll | 177 ++ ...ve-streaming-mode-fixed-length-fp-arith.ll | 989 +++++++ ...streaming-mode-fixed-length-fp-compares.ll | 2486 +++++++++++++++++ ...-streaming-mode-fixed-length-fp-convert.ll | 12 + ...aming-mode-fixed-length-fp-extend-trunc.ll | 278 ++ .../sve-streaming-mode-fixed-length-fp-fma.ll | 116 + ...e-streaming-mode-fixed-length-fp-minmax.ll | 965 +++++++ ...eaming-mode-fixed-length-fp-reduce-fa64.ll | 25 + ...e-streaming-mode-fixed-length-fp-reduce.ll | 1058 +++++++ ...streaming-mode-fixed-length-fp-rounding.ll | 547 ++++ ...e-streaming-mode-fixed-length-fp-select.ll | 99 + ...e-streaming-mode-fixed-length-fp-to-int.ll | 925 ++++++ ...-streaming-mode-fixed-length-fp-vselect.ll | 199 ++ ...ing-mode-fixed-length-insert-vector-elt.ll | 172 ++ 
...e-streaming-mode-fixed-length-int-arith.ll | 371 +++ ...treaming-mode-fixed-length-int-compares.ll | 154 + ...sve-streaming-mode-fixed-length-int-div.ll | 1145 ++++++++ ...streaming-mode-fixed-length-int-extends.ll | 763 +++++ ...eaming-mode-fixed-length-int-immediates.ll | 546 ++++ ...sve-streaming-mode-fixed-length-int-log.ll | 229 ++ ...-streaming-mode-fixed-length-int-minmax.ll | 325 +++ ...ing-mode-fixed-length-int-mla-neon-fa64.ll | 7 + ...ve-streaming-mode-fixed-length-int-mulh.ll | 291 ++ ...-streaming-mode-fixed-length-int-reduce.ll | 415 +++ ...sve-streaming-mode-fixed-length-int-rem.ll | 1631 +++++++++++ ...-streaming-mode-fixed-length-int-select.ll | 137 + ...-streaming-mode-fixed-length-int-shifts.ll | 313 +++ ...e-streaming-mode-fixed-length-int-to-fp.ll | 822 ++++++ ...streaming-mode-fixed-length-int-vselect.ll | 123 + ...reaming-mode-fixed-length-limit-duplane.ll | 27 + .../sve-streaming-mode-fixed-length-loads.ll | 127 + ...-streaming-mode-fixed-length-log-reduce.ll | 436 +++ ...streaming-mode-fixed-length-masked-load.ll | 954 +++++++ ...treaming-mode-fixed-length-masked-store.ll | 774 +++++ ...eaming-mode-fixed-length-optimize-ptrue.ll | 216 ++ ...streaming-mode-fixed-length-permute-rev.ll | 127 + ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 320 +++ .../sve-streaming-mode-fixed-length-ptest.ll | 72 + .../sve-streaming-mode-fixed-length-rev.ll | 159 ++ ...e-streaming-mode-fixed-length-sdiv-pow2.ll | 132 + ...treaming-mode-fixed-length-splat-vector.ll | 182 ++ .../sve-streaming-mode-fixed-length-stores.ll | 136 + ...e-streaming-mode-fixed-length-subvector.ll | 133 + ...treaming-mode-fixed-length-trunc-stores.ll | 38 + .../sve-streaming-mode-fixed-length-trunc.ll | 389 +++ ...eaming-mode-fixed-length-vector-shuffle.ll | 151 + .../sve-streaming-mode-test-register-mov.ll | 21 + 56 files changed, 21006 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll index e843537c10a33a..ed3222529a3bb9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -14,6 +15,12 @@ define <4 x i8> @vls_sve_and_4xi8(<4 x i8> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_4xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff000000ff0000 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %c = and <4 x i8> %b, ret <4 x i8> %c } @@ -27,6 +34,12 @@ define <8 x i8> @vls_sve_and_8xi8(<8 x i8> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_8xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff00 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %c = and <8 x i8> %b, ret <8 x i8> %c } @@ -40,6 +53,12 @@ define <16 x i8> @vls_sve_and_16xi8(<16 x i8> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_16xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.2d, #0xff00ff00ff00ff00 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %c = and <16 x i8> %b, ret <16 x i8> %c } @@ -56,6 +75,13 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind { ; 
CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_32xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v2.2d, #0xff00ff00ff00ff00 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %b = and <32 x i8> %ap, ret <32 x i8> %b @@ -73,6 +99,13 @@ define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_2xi16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov v0.s[0], wzr +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %c = and <2 x i16> %b, ret <2 x i16> %c } @@ -86,6 +119,12 @@ define <4 x i16> @vls_sve_and_4xi16(<4 x i16> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_4xi16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xffff0000ffff0000 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %c = and <4 x i16> %b, ret <4 x i16> %c } @@ -99,6 +138,12 @@ define <8 x i16> @vls_sve_and_8xi16(<8 x i16> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_8xi16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.2d, #0xffff0000ffff0000 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %c = and <8 x i16> %b, ret <8 x i16> %c } @@ -115,6 +160,13 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_16xi16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: movi v2.2d, #0xffff0000ffff0000 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %c = and <16 x i16> %b, ret <16 x i16> %c } @@ -128,6 +180,13 @@ define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_2xi32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov v0.s[0], wzr +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %c = and <2 x i32> %b, ret <2 x i32> %c } @@ -141,6 +200,12 @@ define <4 x i32> @vls_sve_and_4xi32(<4 x i32> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_4xi32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.2d, #0xffffffff00000000 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %c = and <4 x i32> %b, ret <4 x i32> %c } @@ -157,6 +222,13 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_8xi32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v2.2d, #0xffffffff00000000 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %c = and <8 x i32> %b, ret <8 x i32> %c } @@ -170,6 +242,11 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_2xi64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov v0.d[0], xzr +; NONEON-NOSVE-NEXT: ret %c = and <2 x i64> %b, ret <2 
x i64> %c } @@ -185,6 +262,12 @@ define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_4xi64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov v0.d[0], xzr +; NONEON-NOSVE-NEXT: mov v1.d[0], xzr +; NONEON-NOSVE-NEXT: ret %c = and <4 x i64> %b, ret <4 x i64> %c } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll index aa42d5c2a8c132..cd6c2b489efe4c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,16 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) { ; CHECK-NEXT: sub z0.h, z0.h, #8 // =0x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: mov w8, #8 // =0x8 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %op) ret <4 x i8> %res } @@ -30,6 +41,11 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) { ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v8i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: clz v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op) ret <8 x i8> %res } @@ -42,6 +58,11 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) { ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op) ret <16 x i8> %res } @@ -55,6 +76,14 @@ define void @ctlz_v32i8(ptr %a) { ; CHECK-NEXT: clz z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: clz v0.16b, v0.16b +; NONEON-NOSVE-NEXT: clz v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op) store <32 x i8> %res, ptr %a @@ -71,6 +100,16 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) { ; CHECK-NEXT: sub z0.s, z0.s, #16 // =0x10 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -83,6 +122,11 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ 
-95,6 +139,11 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -108,6 +157,14 @@ define void @ctlz_v16i16(ptr %a) { ; CHECK-NEXT: clz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: clz v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -122,6 +179,11 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -134,6 +196,11 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -147,6 +214,14 @@ define void @ctlz_v8i32(ptr %a) { ; CHECK-NEXT: clz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: clz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -161,6 +236,27 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) { ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushr d1, d0, #1 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #2 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #4 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #8 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #16 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #32 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: mvn v0.8b, v0.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -173,6 +269,27 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) { ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #1 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #2 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #4 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #8 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #16 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #32 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; 
NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -186,6 +303,46 @@ define void @ctlz_v4i64(ptr %a) { ; CHECK-NEXT: clz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #1 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #1 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #2 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #2 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #4 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #4 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #8 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #8 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #16 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #16 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #32 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #32 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: mvn v1.16b, v1.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v1.4s, 
v1.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op) store <4 x i64> %res, ptr %a @@ -205,6 +362,14 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) { ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op) ret <4 x i8> %res } @@ -217,6 +382,11 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) { ; CHECK-NEXT: cnt z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op) ret <8 x i8> %res } @@ -229,6 +399,11 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) { ; CHECK-NEXT: cnt z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op) ret <16 x i8> %res } @@ -242,6 +417,14 @@ define void @ctpop_v32i8(ptr %a) { ; CHECK-NEXT: cnt z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x 
i8> %op) store <32 x i8> %res, ptr %a @@ -257,6 +440,15 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) { ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -269,6 +461,12 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) { ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ -281,6 +479,12 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) { ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -294,6 +498,16 @@ define void @ctpop_v16i16(ptr %a) { ; CHECK-NEXT: cnt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> 
@llvm.ctpop.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -308,6 +522,13 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) { ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -320,6 +541,13 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) { ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -333,6 +561,18 @@ define void @ctpop_v8i32(ptr %a) { ; CHECK-NEXT: cnt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -347,6 +587,14 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) { ; CHECK-NEXT: cnt z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; 
NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -359,6 +607,14 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) { ; CHECK-NEXT: cnt z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -372,6 +628,20 @@ define void @ctpop_v4i64(ptr %a) { ; CHECK-NEXT: cnt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op) store <4 x i64> %res, ptr %a @@ -392,6 +662,21 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #256 // =0x100 +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub v1.4h, v0.4h, v2.4h +; 
NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: sub v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %op) ret <4 x i8> %res } @@ -405,6 +690,14 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) { ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.8b, #1 +; NONEON-NOSVE-NEXT: sub v1.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op) ret <8 x i8> %res } @@ -418,6 +711,14 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) { ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.16b, #1 +; NONEON-NOSVE-NEXT: sub v1.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op) ret <16 x i8> %res } @@ -433,6 +734,19 @@ define void @cttz_v32i8(ptr %a) { ; CHECK-NEXT: clz z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #1 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v3.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: sub v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op) store <32 x i8> 
%res, ptr %a @@ -449,6 +763,21 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #65536 // =0x10000 +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub v1.2s, v0.2s, v2.2s +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: sub v0.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -462,6 +791,18 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: sub v1.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: sub v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ -475,6 +816,18 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.8h, w8 +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: sub v1.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: dup v1.8h, w8 +; NONEON-NOSVE-NEXT: 
clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -490,6 +843,24 @@ define void @cttz_v16i16(ptr %a) { ; CHECK-NEXT: clz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: sub v3.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: dup v2.8h, w8 +; NONEON-NOSVE-NEXT: clz v1.8h, v1.8h +; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -505,6 +876,18 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 +; NONEON-NOSVE-NEXT: sub v1.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: sub v0.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -518,6 +901,18 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v4i32: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.4s, w8 +; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 +; NONEON-NOSVE-NEXT: sub v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: dup v1.4s, w8 +; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -533,6 +928,24 @@ define void @cttz_v8i32(ptr %a) { ; CHECK-NEXT: clz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 +; NONEON-NOSVE-NEXT: sub v3.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: dup v2.4s, w8 +; NONEON-NOSVE-NEXT: clz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -548,6 +961,18 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) { ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: sub d1, d0, d1 +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <1 
x i64> @llvm.cttz.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -561,6 +986,18 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) { ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.2d, x8 +; NONEON-NOSVE-NEXT: sub v1.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -576,6 +1013,26 @@ define void @cttz_v4i64(ptr %a) { ; CHECK-NEXT: clz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: sub v3.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: sub v0.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op) store <4 x i64> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll index 260ad16581f139..7e93ee99ed7494 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -11,6 +12,12 @@ define void @bitcast_v4i8(ptr %a, ptr %b) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: st1b { z0.h }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i8>, ptr %a %cast = bitcast <4 x i8> %load to <4 x i8> store volatile <4 x i8> %cast, ptr %b @@ -23,6 +30,12 @@ define void @bitcast_v8i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <8 x i8>, ptr %a %cast = bitcast <8 x i8> %load to <8 x i8> store volatile <8 x i8> %cast, ptr %b @@ -35,6 +48,12 @@ define void @bitcast_v16i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <16 x i8>, ptr %a %cast = bitcast <16 x i8> %load to <16 x i8> store volatile <16 x i8> %cast, ptr %b @@ -49,6 +68,14 @@ define void @bitcast_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x1, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v32i8: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <32 x i8>, ptr %a %cast = bitcast <32 x i8> %load to <32 x i8> store volatile <32 x i8> %cast, ptr %b @@ -72,6 +99,16 @@ define void @bitcast_v2i16(ptr %a, ptr %b) { ; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: add x8, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <2 x i16>, ptr %a %cast = bitcast <2 x i16> %load to <2 x half> store volatile <2 x half> %cast, ptr %b @@ -84,6 +121,12 @@ define void @bitcast_v4i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i16>, ptr %a %cast = bitcast <4 x i16> %load to <4 x half> store volatile <4 x half> %cast, ptr %b @@ -96,6 +139,12 @@ define void @bitcast_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <8 x i16>, ptr %a %cast = bitcast <8 x i16> %load to <8 x half> store volatile <8 x half> %cast, ptr %b @@ -110,6 +159,14 @@ define void @bitcast_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x1, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, 
[x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <16 x i16>, ptr %a %cast = bitcast <16 x i16> %load to <16 x half> store volatile <16 x half> %cast, ptr %b @@ -122,6 +179,12 @@ define void @bitcast_v2i32(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <2 x i32>, ptr %a %cast = bitcast <2 x i32> %load to <2 x float> store volatile <2 x float> %cast, ptr %b @@ -134,6 +197,12 @@ define void @bitcast_v4i32(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i32>, ptr %a %cast = bitcast <4 x i32> %load to <4 x float> store volatile <4 x float> %cast, ptr %b @@ -148,6 +217,14 @@ define void @bitcast_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x1, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <8 x i32>, ptr %a %cast = bitcast <8 x i32> %load to <8 x float> store volatile <8 x float> %cast, ptr %b @@ -160,6 +237,12 @@ define void @bitcast_v1i64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <1 x i64>, ptr %a %cast 
= bitcast <1 x i64> %load to <1 x double> store volatile <1 x double> %cast, ptr %b @@ -172,6 +255,12 @@ define void @bitcast_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <2 x i64>, ptr %a %cast = bitcast <2 x i64> %load to <2 x double> store volatile <2 x double> %cast, ptr %b @@ -186,6 +275,14 @@ define void @bitcast_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x1, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i64>, ptr %a %cast = bitcast <4 x i64> %load to <4 x double> store volatile <4 x double> %cast, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll index 9a07bd8bd5ac9f..6b8077053b590f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64" @@ -30,6 +31,17 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fixed_bitselect_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x2] +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: ret %pre_cond = load <8 x i32>, ptr %pre_cond_ptr %left = load <8 x i32>, ptr %left_ptr %right = load <8 x i32>, ptr %right_ptr diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll index aec434b4819d70..318a9cf7d738b2 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -10,6 +11,12 @@ define void @build_vector_7_inc1_v4i1(ptr %a) { ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: strb w8, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_7_inc1_v4i1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: strb w8, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i1> , ptr %a, align 1 ret void } @@ -23,6 +30,15 @@ define void @build_vector_7_inc1_v32i8(ptr %a) { ; CHECK-NEXT: add z1.b, z1.b, #23 // =0x17 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_7_inc1_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI1_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI1_0] +; NONEON-NOSVE-NEXT: ldr 
q1, [x9, :lo12:.LCPI1_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <32 x i8> , ptr %a, align 1 ret void } @@ -35,6 +51,15 @@ define void @build_vector_0_inc2_v16i16(ptr %a) { ; CHECK-NEXT: add z0.h, z0.h, #16 // =0x10 ; CHECK-NEXT: str q0, [x0, #16] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI2_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI2_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x i16> , ptr %a, align 2 ret void } @@ -48,6 +73,15 @@ define void @build_vector_0_dec3_v8i32(ptr %a) { ; CHECK-NEXT: add z1.s, z0.s, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI3_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI3_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x i32> , ptr %a, align 4 ret void } @@ -64,6 +98,15 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) { ; CHECK-NEXT: add z0.d, z0.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI4_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI4_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI4_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i64> , ptr %a, align 8 ret void } @@ -76,6 +119,15 @@ define void @build_vector_no_stride_v4i64(ptr %a) { ; CHECK-NEXT: index z1.d, #0, #4 ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_no_stride_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, 
.LCPI5_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI5_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI5_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i64> , ptr %a, align 8 ret void } @@ -89,6 +141,15 @@ define void @build_vector_0_inc2_v16f16(ptr %a) { ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI6_1] ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI6_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI6_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI6_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x half> , ptr %a, align 2 ret void } @@ -103,6 +164,15 @@ define void @build_vector_0_dec3_v8f32(ptr %a) { ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI7_1] ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI7_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI7_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI7_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x float> , ptr %a, align 4 ret void } @@ -117,6 +187,15 @@ define void @build_vector_minus2_dec32_v4f64(ptr %a) { ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI8_1] ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI8_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI8_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x double> , ptr %a, align 8 ret void } @@ -131,6 +210,15 @@ define void @build_vector_no_stride_v4f64(ptr %a) { ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI9_1] 
; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_no_stride_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI9_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI9_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI9_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x double> , ptr %a, align 8 ret void } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll index 82e75d6efda352..d2bfc7d4e80969 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -40,6 +41,11 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> ret <8 x i8> %res } @@ -53,6 +59,13 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <8 x i8> %op1, 
<8 x i8> %op2, <16 x i32> ret <16 x i8> %res @@ -65,6 +78,13 @@ define void @concat_v32i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i8>, ptr %a %op2 = load <16 x i8>, ptr %b %res = shufflevector <16 x i8> %op1, <16 x i8> %op2, <32 x i32> , ptr %a %op2 = load <32 x i8>, ptr %b %res = shufflevector <32 x i8> %op1, <32 x i8> %op2, <64 x i32> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> ret <4 x i16> %res } @@ -135,6 +168,13 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> ret <8 x i16> %res } @@ -146,6 +186,13 @@ define void @concat_v16i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b %res = shufflevector <8 x i16> %op1, <8 x i16> %op2, <16 x i32> , ptr %a %op2 = 
load <16 x i16>, ptr %b %res = shufflevector <16 x i16> %op1, <16 x i16> %op2, <32 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) { ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> ret <2 x i32> %res } @@ -199,6 +259,13 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> ret <4 x i32> %res } @@ -210,6 +277,13 @@ define void @concat_v8i32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %op2 = load <4 x i32>, ptr %b %res = shufflevector <4 x i32> %op1, <4 x i32> %op2, <8 x i32> @@ -225,6 +299,14 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: stp q0, q1, [x2, #32] ; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2, #32] +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = shufflevector <8 x i32> %op1, <8 x i32> 
%op2, <16 x i32> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> ret <2 x i64> %res } @@ -258,6 +347,13 @@ define void @concat_v4i64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %res = shufflevector <2 x i64> %op1, <2 x i64> %op2, <4 x i32> @@ -273,6 +369,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: stp q0, q1, [x2, #32] ; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2, #32] +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = shufflevector <4 x i64> %op1, <4 x i64> %op2, <8 x i32> @@ -300,6 +404,11 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> ret <4 x half> %res } @@ -313,6 +422,13 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> 
%op2) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> ret <8 x half> %res } @@ -324,6 +440,13 @@ define void @concat_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %op2 = load <8 x half>, ptr %b %res = shufflevector <8 x half> %op1, <8 x half> %op2, <16 x i32> , ptr %a %op2 = load <16 x half>, ptr %b %res = shufflevector <16 x half> %op1, <16 x half> %op2, <32 x i32> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) { ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> ret <2 x float> %res } @@ -377,6 +513,13 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> ret <4 x float> %res } @@ -388,6 
+531,13 @@ define void @concat_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %op2 = load <4 x float>, ptr %b %res = shufflevector <4 x float> %op1, <4 x float> %op2, <8 x i32> @@ -403,6 +553,14 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: stp q0, q1, [x2, #32] ; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2, #32] +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = shufflevector <8 x float> %op1, <8 x float> %op2, <16 x i32> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> ret <2 x double> %res } @@ -436,6 +601,13 @@ define void @concat_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %op2 = load <2 x double>, ptr %b %res = shufflevector <2 x double> %op1, <2 x double> 
%op2, <4 x i32> @@ -451,6 +623,14 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: stp q0, q1, [x2, #32] ; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2, #32] +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = shufflevector <4 x double> %op1, <4 x double> %op2, <8 x i32> @@ -468,6 +648,12 @@ define void @concat_v32i8_undef(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v32i8_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i8>, ptr %a %res = shufflevector <16 x i8> %op1, <16 x i8> undef, <32 x i32> , ptr %a %res = shufflevector <8 x i16> %op1, <8 x i16> undef, <16 x i32> @@ -496,6 +688,12 @@ define void @concat_v8i32_undef(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = shufflevector <4 x i32> %op1, <4 x i32> undef, <8 x i32> store <8 x i32> %res, ptr %b @@ -508,6 +706,12 @@ define void @concat_v4i64_undef(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i64_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %res = shufflevector <2 x i64> %op1, <2 x i64> undef, <4 x i32> store <4 x i64> %res, ptr %b @@ -524,6 +728,12 @@ define void @concat_v32i8_4op(ptr %a, ptr %b) { ; CHECK-NEXT: 
ldr d0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v32i8_4op: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i8>, ptr %a %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> @@ -541,6 +751,12 @@ define void @concat_v16i16_4op(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16i16_4op: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> %res = shufflevector <8 x i16> %shuffle, <8 x i16> undef, <16 x i32> , ptr %a %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> %res = shufflevector <4 x i32> %shuffle, <4 x i32> undef, <8 x i32> @@ -568,6 +790,12 @@ define void @concat_v4i64_4op(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i64_4op: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <1 x i64>, ptr %a %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> %res = shufflevector <2 x i64> %shuffle, <2 x i64> undef, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 040e5861e98101..728b85d39bb37f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | 
FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -11,6 +12,12 @@ define <8 x i16> @load_zext_v8i8i16(ptr %ap) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v8i8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ret %a = load <8 x i8>, ptr %ap %val = zext <8 x i8> %a to <8 x i16> ret <8 x i16> %val @@ -23,6 +30,12 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap) { ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v4i16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ret %a = load <4 x i16>, ptr %ap %val = zext <4 x i16> %a to <4 x i32> ret <4 x i32> %val @@ -35,6 +48,12 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) { ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v2i32i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ret %a = load <2 x i32>, ptr %ap %val = zext <2 x i32> %a to <2 x i64> ret <2 x i64> %val @@ -54,6 +73,19 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) { ; CHECK-NEXT: mov x7, xzr ; CHECK-NEXT: fmov x4, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v2i64i256: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: mov x1, xzr +; NONEON-NOSVE-NEXT: mov x2, xzr +; NONEON-NOSVE-NEXT: mov x3, xzr +; NONEON-NOSVE-NEXT: mov x5, xzr +; NONEON-NOSVE-NEXT: mov x6, xzr +; NONEON-NOSVE-NEXT: mov x4, v0.d[1] +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: mov x7, xzr +; 
NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = zext <2 x i64> %a to <2 x i256> ret <2 x i256> %val @@ -75,6 +107,24 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap) { ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 ; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_sext_v16i8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: sshll v1.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v2.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v1.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i8>, ptr %ap %val = sext <16 x i8> %a to <16 x i32> ret <16 x i32> %val @@ -90,6 +140,17 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_sext_v8i16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %ap %val = sext <8 x i16> %a to <8 x i32> ret <8 x i32> %val @@ -121,6 +182,39 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) { ; CHECK-NEXT: stp x12, x12, [x8, #112] ; CHECK-NEXT: stp x11, x12, [x8, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_sext_v4i32i256: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: add x10, x8, #32 +; NONEON-NOSVE-NEXT: add x11, x8, #96 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: mov x9, v0.d[1] +; NONEON-NOSVE-NEXT: st1 { v0.d }[1], [x10] +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: st1 { v1.d }[1], [x11] +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: asr x10, x10, #63 +; NONEON-NOSVE-NEXT: str d0, [x8] +; NONEON-NOSVE-NEXT: asr x9, x9, #63 +; NONEON-NOSVE-NEXT: str d1, [x8, #64] +; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #16] +; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #48] +; NONEON-NOSVE-NEXT: str x9, [x8, #40] +; NONEON-NOSVE-NEXT: fmov x9, d1 +; NONEON-NOSVE-NEXT: str x10, [x8, #8] +; NONEON-NOSVE-NEXT: asr x10, x11, #63 +; NONEON-NOSVE-NEXT: asr x9, x9, #63 +; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #112] +; NONEON-NOSVE-NEXT: str x10, [x8, #104] +; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #80] +; NONEON-NOSVE-NEXT: str x9, [x8, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = sext <4 x i32> %a to <4 x i256> ret <4 x i256> %val @@ -154,6 +248,22 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) { ; CHECK-NEXT: fmov x1, d6 ; CHECK-NEXT: fmov x5, d0 ; CHECK-NEXT: ret +; 
+; NONEON-NOSVE-LABEL: load_sext_v2i64i256: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: dup v1.2d, v0.d[1] +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: asr x1, x0, #63 +; NONEON-NOSVE-NEXT: asr x5, x8, #63 +; NONEON-NOSVE-NEXT: mov x2, x1 +; NONEON-NOSVE-NEXT: mov x3, x1 +; NONEON-NOSVE-NEXT: mov v1.d[1], x5 +; NONEON-NOSVE-NEXT: mov x6, x5 +; NONEON-NOSVE-NEXT: mov x7, x5 +; NONEON-NOSVE-NEXT: fmov x4, d1 +; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = sext <2 x i64> %a to <2 x i256> ret <2 x i256> %val @@ -187,6 +297,34 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap) { ; CHECK-NEXT: // kill: def $q6 killed $q6 killed $z6 ; CHECK-NEXT: // kill: def $q7 killed $q7 killed $z7 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v16i16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushll v2.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v3.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v4.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v5.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v2.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] +; NONEON-NOSVE-NEXT: stp q5, q3, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d16, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d17, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v1.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: ushll v6.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ushll v5.2d, v16.2s, #0 +; NONEON-NOSVE-NEXT: ushll v7.2d, v17.2s, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %ap %val = zext <16 x i16> %a to <16 x i64> ret <16 x i64> %val diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll index 45a804becbc557..ec6341d6085a0a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -27,6 +28,11 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8i1: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %ret = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %op, i64 4) ret <4 x i1> %ret } @@ -54,6 +60,11 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4) ret <4 x i8> %ret } @@ -65,6 +76,14 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %op, i64 8) ret <8 x i8> %ret } @@ -75,6 +94,12 @@ define void @extract_subvector_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %ret = call <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8> %op, i64 16) store <16 x i8> %ret, ptr %b @@ -91,6 +116,15 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2) ret <2 x i16> %ret } @@ -102,6 +136,14 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %op, i64 4) ret <4 x i16> %ret } @@ -112,6 +154,12 @@ define void @extract_subvector_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %ret = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> %op, i64 8) store <8 x i16> %ret, ptr %b @@ -127,6 +175,12 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) { ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: ret %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1) ret <1 x i32> %ret } @@ -138,6 +192,14 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4i32: 
+; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %op, i64 2) ret <2 x i32> %ret } @@ -148,6 +210,12 @@ define void @extract_subvector_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %ret = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %op, i64 4) store <4 x i32> %ret, ptr %b @@ -163,6 +231,14 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64> %op, i64 1) ret <1 x i64> %ret } @@ -173,6 +249,12 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %ret = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> %op, i64 2) store <2 x i64> %ret, ptr %b @@ -190,6 +272,12 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) { ; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: ret %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2) ret <2 x half> %ret } @@ -201,6 +289,14 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half> %op, i64 4) ret <4 x half> %ret } @@ -211,6 +307,12 @@ define void @extract_subvector_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %ret = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> %op, i64 8) store <8 x half> %ret, ptr %b @@ -226,6 +328,12 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) { ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: ret %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1) ret <1 x float> %ret } @@ -237,6 +345,14 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> %op, i64 2) ret <2 x float> %ret } @@ -247,6 +363,12 @@ define void @extract_subvector_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %ret = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> %op, i64 4) store <4 x float> %ret, ptr %b @@ -262,6 +384,14 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double> %op, i64 1) ret <1 x double> %ret } @@ -272,6 +402,12 @@ define void @extract_subvector_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %ret = call <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double> %op, i64 2) store <2 x double> %ret, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll index 9c3b5e14289dc1..ac60a614d7ce6c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -15,6 +16,12 @@ define half @extractelement_v2f16(<2 x half> %op1) { ; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h0, v0.h[1] +; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x half> %op1, i64 1 ret half %r } @@ -26,6 +33,12 @@ define half @extractelement_v4f16(<4 x half> %op1) { ; 
CHECK-NEXT: mov z0.h, z0.h[3] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: ret %r = extractelement <4 x half> %op1, i64 3 ret half %r } @@ -37,6 +50,11 @@ define half @extractelement_v8f16(<8 x half> %op1) { ; CHECK-NEXT: mov z0.h, z0.h[7] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: ret %r = extractelement <8 x half> %op1, i64 7 ret half %r } @@ -48,6 +66,11 @@ define half @extractelement_v16f16(ptr %a) { ; CHECK-NEXT: mov z0.h, z0.h[7] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0, #30] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %r = extractelement <16 x half> %op1, i64 15 ret half %r @@ -60,6 +83,12 @@ define float @extractelement_v2f32(<2 x float> %op1) { ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov s0, v0.s[1] +; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x float> %op1, i64 1 ret float %r } @@ -71,6 +100,11 @@ define float @extractelement_v4f32(<4 x float> %op1) { ; CHECK-NEXT: mov z0.s, z0.s[3] ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov s0, v0.s[3] +; NONEON-NOSVE-NEXT: ret %r = extractelement <4 x float> %op1, i64 3 ret float %r } @@ -82,6 +116,11 @@ define float @extractelement_v8f32(ptr %a) { ; 
CHECK-NEXT: mov z0.s, z0.s[3] ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0, #28] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %r = extractelement <8 x float> %op1, i64 7 ret float %r @@ -91,6 +130,10 @@ define double @extractelement_v1f64(<1 x double> %op1) { ; CHECK-LABEL: extractelement_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %r = extractelement <1 x double> %op1, i64 0 ret double %r } @@ -101,6 +144,11 @@ define double @extractelement_v2f64(<2 x double> %op1) { ; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov d0, v0.d[1] +; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x double> %op1, i64 1 ret double %r } @@ -112,6 +160,11 @@ define double @extractelement_v4f64(ptr %a) { ; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0, #24] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %r = extractelement <4 x double> %op1, i64 3 ret double %r diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index 21ce689f68e23a..c1d84f6a15ed8c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; 
RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -28,6 +29,16 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str d1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: ldr d2, [x1] +; NONEON-NOSVE-NEXT: dup v0.4h, w8 +; NONEON-NOSVE-NEXT: bsl v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x half>, ptr %bp %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) @@ -54,6 +65,16 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %ap %b = load <8 x half>, ptr %bp %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) @@ -84,6 +105,17 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z3.d, z3.d, z4.d, z0.d ; SVE2-NEXT: stp q2, q3, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v16f16_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl 
v0.16b, v3.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <16 x half>, ptr %ap %b = load <16 x half>, ptr %bp %r = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b) @@ -112,6 +144,16 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str d1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d0, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: ldr d2, [x1] +; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s +; NONEON-NOSVE-NEXT: bsl v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %ap %b = load <2 x float>, ptr %bp %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) @@ -138,6 +180,16 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %ap %b = load <4 x float>, ptr %bp %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) @@ -168,6 +220,17 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z3.d, z3.d, z4.d, z0.d ; SVE2-NEXT: stp q2, q3, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v8f32_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b +; 
NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <8 x float>, ptr %ap %b = load <8 x float>, ptr %bp %r = call <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b) @@ -196,6 +259,16 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %ap %b = load <2 x double>, ptr %bp %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) @@ -226,6 +299,17 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z3.d, z3.d, z4.d, z0.d ; SVE2-NEXT: stp q2, q3, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x double>, ptr %bp %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) @@ -260,6 +344,17 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str d2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d0, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldr d2, [x0] +; 
NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d +; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s +; NONEON-NOSVE-NEXT: bsl v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %ap %b = load <2 x double>, ptr %bp %tmp0 = fptrunc <2 x double> %b to <2 x float> @@ -304,6 +399,18 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str q2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.4s, v2.2d +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %ap %b = load <4 x double>, ptr %bp %tmp0 = fptrunc <4 x double> %b to <4 x float> @@ -337,6 +444,17 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str q2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %ap %b = load < 2 x float>, ptr %bp %tmp0 = fpext <2 x float> %b to <2 x double> @@ -381,6 +499,23 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z4.d, z4.d, z1.d, z2.d ; SVE2-NEXT: stp q3, q4, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; 
NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtl v4.2d, v4.2s +; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x float>, ptr %bp %tmp0 = fpext <4 x float> %b to <4 x double> @@ -416,6 +551,17 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str d2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldr d2, [x0] +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: bit v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x float>, ptr %bp %tmp0 = fptrunc <4 x float> %b to <4 x half> @@ -471,6 +617,25 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: str d5, [x0] ; SVE2-NEXT: add sp, sp, #16 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x1] +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: fcvt h1, d1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: mov d2, v2.d[1] +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: ldr d2, [x0] +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; 
NONEON-NOSVE-NEXT: bit v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x double>, ptr %bp %tmp0 = fptrunc <4 x double> %b to <4 x half> @@ -514,6 +679,18 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str q2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: dup v1.8h, w8 +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %ap %b = load <8 x float>, ptr %bp %tmp0 = fptrunc <8 x float> %b to <8 x half> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll index b0a82e699939f1..b51b89d08844d0 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,14 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: 
fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fadd <2 x half> %op1, %op2 ret <2 x half> %res } @@ -30,6 +39,14 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fadd <4 x half> %op1, %op2 ret <4 x half> %res } @@ -43,6 +60,18 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fadd <8 x half> %op1, %op2 ret <8 x half> %res } @@ -58,6 +87,29 @@ define void @fadd_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; 
NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fadd <16 x half> %op1, %op2 @@ -74,6 +126,11 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = fadd <2 x float> %op1, %op2 ret <2 x float> %res } @@ -87,6 +144,11 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fadd <4 x float> %op1, %op2 ret <4 x float> %res } @@ -102,6 +164,15 @@ define void @fadd_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fadd <8 x float> %op1, %op2 @@ -118,6 +189,11 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, 
z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = fadd <2 x double> %op1, %op2 ret <2 x double> %res } @@ -133,6 +209,15 @@ define void @fadd_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = fadd <4 x double> %op1, %op2 @@ -153,6 +238,14 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x half> %op1, %op2 ret <2 x half> %res } @@ -166,6 +259,14 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fdiv <4 x half> %op1, %op2 ret <4 x half> %res } @@ -179,6 +280,18 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h ; 
CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fdiv v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fdiv v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fdiv <8 x half> %op1, %op2 ret <8 x half> %res } @@ -194,6 +307,30 @@ define void @fdiv_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fdiv z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q4, q1, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v5.4s, v4.8h +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v4.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fdiv v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: ldr q3, [x0] +; NONEON-NOSVE-NEXT: fcvtl2 v6.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fdiv v3.4s, v3.4s, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fdiv v5.4s, v6.4s, v5.4s +; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fdiv <16 x half> %op1, %op2 @@ -210,6 +347,11 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v2f32: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fdiv v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x float> %op1, %op2 ret <2 x float> %res } @@ -223,6 +365,11 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fdiv <4 x float> %op1, %op2 ret <4 x float> %res } @@ -238,6 +385,15 @@ define void @fdiv_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fdiv z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fdiv v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fdiv v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fdiv <8 x float> %op1, %op2 @@ -254,6 +410,11 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fdiv v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x double> %op1, %op2 ret <2 x double> %res } @@ -269,6 +430,15 @@ define void @fdiv_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fdiv z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fdiv v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fdiv v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr 
%a %op2 = load <4 x double>, ptr %b %res = fdiv <4 x double> %op1, %op2 @@ -290,6 +460,46 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d2 killed $d2 def $q2 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: fcvt s16, h0 +; NONEON-NOSVE-NEXT: mov h17, v2.h[2] +; NONEON-NOSVE-NEXT: mov h18, v1.h[2] +; NONEON-NOSVE-NEXT: mov h19, v0.h[2] +; NONEON-NOSVE-NEXT: mov h2, v2.h[3] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 +; NONEON-NOSVE-NEXT: mov h16, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmadd s3, s5, s4, s3 +; NONEON-NOSVE-NEXT: fcvt s4, h17 +; NONEON-NOSVE-NEXT: fcvt s5, h18 +; NONEON-NOSVE-NEXT: fcvt h0, s6 +; NONEON-NOSVE-NEXT: fmadd s4, s7, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h16 +; NONEON-NOSVE-NEXT: mov v0.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fmadd s1, s5, s1, s2 +; NONEON-NOSVE-NEXT: mov v0.h[2], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) ret <2 x half> %res } @@ -304,6 +514,46 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> 
%op2, <4 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d2 killed $d2 def $q2 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: fcvt s16, h0 +; NONEON-NOSVE-NEXT: mov h17, v2.h[2] +; NONEON-NOSVE-NEXT: mov h18, v1.h[2] +; NONEON-NOSVE-NEXT: mov h19, v0.h[2] +; NONEON-NOSVE-NEXT: mov h2, v2.h[3] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 +; NONEON-NOSVE-NEXT: mov h16, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmadd s3, s5, s4, s3 +; NONEON-NOSVE-NEXT: fcvt s4, h17 +; NONEON-NOSVE-NEXT: fcvt s5, h18 +; NONEON-NOSVE-NEXT: fcvt h0, s6 +; NONEON-NOSVE-NEXT: fmadd s4, s7, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h16 +; NONEON-NOSVE-NEXT: mov v0.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fmadd s1, s5, s1, s2 +; NONEON-NOSVE-NEXT: mov v0.h[2], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ret <4 x half> %res } @@ -318,6 +568,79 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fma_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h3, v2.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: fcvt s16, h0 +; NONEON-NOSVE-NEXT: mov h17, v2.h[2] +; NONEON-NOSVE-NEXT: mov h18, v1.h[2] +; NONEON-NOSVE-NEXT: mov h19, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 +; NONEON-NOSVE-NEXT: fcvt s7, h17 +; NONEON-NOSVE-NEXT: fcvt s16, h18 +; NONEON-NOSVE-NEXT: fcvt s17, h19 +; NONEON-NOSVE-NEXT: mov h18, v1.h[3] +; NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: fmadd s4, s5, s4, s3 +; NONEON-NOSVE-NEXT: mov h5, v2.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fmadd s6, s17, s16, s7 +; NONEON-NOSVE-NEXT: mov h17, v2.h[4] +; NONEON-NOSVE-NEXT: fcvt s7, h18 +; NONEON-NOSVE-NEXT: fcvt s16, h19 +; NONEON-NOSVE-NEXT: mov h18, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov h19, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov v3.h[1], v4.h[0] +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: fmadd s5, s16, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: mov v3.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: mov h6, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fmadd s17, s19, s18, s17 +; NONEON-NOSVE-NEXT: mov h18, v1.h[6] +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmadd s4, s16, s7, s4 +; 
NONEON-NOSVE-NEXT: mov v3.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h18 +; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: fcvt h16, s17 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fmadd s5, s7, s6, s5 +; NONEON-NOSVE-NEXT: mov v3.h[4], v16.h[0] +; NONEON-NOSVE-NEXT: fmadd s0, s0, s1, s2 +; NONEON-NOSVE-NEXT: mov v3.h[5], v4.h[0] +; NONEON-NOSVE-NEXT: fcvt h4, s5 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v3.h[6], v4.h[0] +; NONEON-NOSVE-NEXT: mov v3.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v3.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ret <8 x half> %res } @@ -334,6 +657,150 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.h, p0/m, z3.h, z4.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q3, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q4, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q5, q2, [x2] +; NONEON-NOSVE-NEXT: mov h25, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: mov h24, v0.h[2] +; NONEON-NOSVE-NEXT: mov h17, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s18, h1 +; NONEON-NOSVE-NEXT: mov h22, v1.h[2] +; NONEON-NOSVE-NEXT: mov h16, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: mov h20, v2.h[2] +; NONEON-NOSVE-NEXT: mov h26, v5.h[1] +; NONEON-NOSVE-NEXT: mov h27, v4.h[1] +; NONEON-NOSVE-NEXT: mov h28, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s25, h25 +; NONEON-NOSVE-NEXT: mov h7, v2.h[3] +; NONEON-NOSVE-NEXT: mov h29, v4.h[2] +; NONEON-NOSVE-NEXT: fcvt s23, h17 +; NONEON-NOSVE-NEXT: mov h17, v0.h[3] +; NONEON-NOSVE-NEXT: mov h30, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s21, h16 +; NONEON-NOSVE-NEXT: fmadd s6, s19, s18, s6 +; NONEON-NOSVE-NEXT: fcvt s18, h20 +; NONEON-NOSVE-NEXT: fcvt s19, 
h22 +; NONEON-NOSVE-NEXT: fcvt s20, h24 +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s22, h5 +; NONEON-NOSVE-NEXT: fcvt s24, h4 +; NONEON-NOSVE-NEXT: fcvt s26, h26 +; NONEON-NOSVE-NEXT: fcvt s27, h27 +; NONEON-NOSVE-NEXT: fcvt s28, h28 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fmadd s21, s25, s23, s21 +; NONEON-NOSVE-NEXT: fcvt s23, h3 +; NONEON-NOSVE-NEXT: mov h25, v5.h[2] +; NONEON-NOSVE-NEXT: fmadd s18, s20, s19, s18 +; NONEON-NOSVE-NEXT: mov h19, v3.h[2] +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: mov h31, v0.h[4] +; NONEON-NOSVE-NEXT: fmadd s26, s28, s27, s26 +; NONEON-NOSVE-NEXT: mov h27, v4.h[3] +; NONEON-NOSVE-NEXT: mov h28, v3.h[3] +; NONEON-NOSVE-NEXT: fmadd s22, s23, s24, s22 +; NONEON-NOSVE-NEXT: fcvt h20, s21 +; NONEON-NOSVE-NEXT: mov h21, v2.h[4] +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt s24, h29 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fmadd s16, s17, s16, s7 +; NONEON-NOSVE-NEXT: mov h25, v5.h[3] +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt h26, s26 +; NONEON-NOSVE-NEXT: mov h29, v2.h[5] +; NONEON-NOSVE-NEXT: mov v6.h[1], v20.h[0] +; NONEON-NOSVE-NEXT: fcvt s17, h21 +; NONEON-NOSVE-NEXT: fcvt s20, h30 +; NONEON-NOSVE-NEXT: fmadd s19, s19, s24, s23 +; NONEON-NOSVE-NEXT: fcvt s21, h31 +; NONEON-NOSVE-NEXT: fcvt h7, s22 +; NONEON-NOSVE-NEXT: fcvt s22, h25 +; NONEON-NOSVE-NEXT: fcvt s23, h27 +; NONEON-NOSVE-NEXT: fcvt s24, h28 +; NONEON-NOSVE-NEXT: mov h25, v5.h[4] +; NONEON-NOSVE-NEXT: mov h27, v4.h[4] +; NONEON-NOSVE-NEXT: mov h28, v3.h[4] +; NONEON-NOSVE-NEXT: mov h30, v1.h[5] +; NONEON-NOSVE-NEXT: mov h31, v0.h[5] +; NONEON-NOSVE-NEXT: mov v6.h[2], v18.h[0] +; NONEON-NOSVE-NEXT: fmadd s17, s21, s20, s17 +; NONEON-NOSVE-NEXT: mov v7.h[1], v26.h[0] +; NONEON-NOSVE-NEXT: fcvt h18, s19 +; NONEON-NOSVE-NEXT: fmadd s19, s24, s23, s22 +; NONEON-NOSVE-NEXT: mov h26, v5.h[5] +; 
NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s20, h25 +; NONEON-NOSVE-NEXT: fcvt s21, h27 +; NONEON-NOSVE-NEXT: fcvt s22, h28 +; NONEON-NOSVE-NEXT: mov h27, v4.h[5] +; NONEON-NOSVE-NEXT: mov h28, v3.h[5] +; NONEON-NOSVE-NEXT: fcvt s23, h29 +; NONEON-NOSVE-NEXT: fcvt s24, h30 +; NONEON-NOSVE-NEXT: fcvt s25, h31 +; NONEON-NOSVE-NEXT: mov h29, v2.h[6] +; NONEON-NOSVE-NEXT: mov h30, v1.h[6] +; NONEON-NOSVE-NEXT: mov h31, v0.h[6] +; NONEON-NOSVE-NEXT: mov v7.h[2], v18.h[0] +; NONEON-NOSVE-NEXT: fcvt h18, s19 +; NONEON-NOSVE-NEXT: fmadd s19, s22, s21, s20 +; NONEON-NOSVE-NEXT: mov h20, v5.h[6] +; NONEON-NOSVE-NEXT: mov h21, v4.h[6] +; NONEON-NOSVE-NEXT: mov h22, v3.h[6] +; NONEON-NOSVE-NEXT: fcvt s26, h26 +; NONEON-NOSVE-NEXT: fmadd s23, s25, s24, s23 +; NONEON-NOSVE-NEXT: fcvt s27, h27 +; NONEON-NOSVE-NEXT: fcvt s28, h28 +; NONEON-NOSVE-NEXT: mov v6.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s17 +; NONEON-NOSVE-NEXT: fcvt s17, h29 +; NONEON-NOSVE-NEXT: fcvt s24, h30 +; NONEON-NOSVE-NEXT: fcvt s25, h31 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: fcvt s21, h21 +; NONEON-NOSVE-NEXT: fcvt s22, h22 +; NONEON-NOSVE-NEXT: mov v7.h[3], v18.h[0] +; NONEON-NOSVE-NEXT: fmadd s26, s28, s27, s26 +; NONEON-NOSVE-NEXT: fcvt h18, s19 +; NONEON-NOSVE-NEXT: mov h5, v5.h[7] +; NONEON-NOSVE-NEXT: mov h4, v4.h[7] +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: fmadd s17, s25, s24, s17 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmadd s19, s22, s21, s20 +; NONEON-NOSVE-NEXT: mov v6.h[4], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s23 +; NONEON-NOSVE-NEXT: mov v7.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fcvt h18, s26 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v6.h[5], v16.h[0] +; 
NONEON-NOSVE-NEXT: mov v7.h[5], v18.h[0] +; NONEON-NOSVE-NEXT: fmadd s3, s3, s4, s5 +; NONEON-NOSVE-NEXT: fcvt h4, s19 +; NONEON-NOSVE-NEXT: fcvt h5, s17 +; NONEON-NOSVE-NEXT: fmadd s0, s0, s1, s2 +; NONEON-NOSVE-NEXT: mov v7.h[6], v4.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s3 +; NONEON-NOSVE-NEXT: mov v6.h[6], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v7.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v6.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q7, q6, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %op3 = load <16 x half>, ptr %c @@ -352,6 +819,12 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) ret <2 x float> %res } @@ -366,6 +839,12 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) ret <4 x float> %res } @@ -382,6 +861,16 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.s, p0/m, z3.s, z4.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fmla v1.4s, v0.4s, v2.4s 
+; NONEON-NOSVE-NEXT: fmla v5.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %op3 = load <8 x float>, ptr %c @@ -400,6 +889,12 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) ret <2 x double> %res } @@ -416,6 +911,16 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.d, p0/m, z3.d, z4.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fmla v1.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: fmla v5.2d, v4.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %op3 = load <4 x double>, ptr %c @@ -437,6 +942,14 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fmul <2 x half> %op1, %op2 ret <2 x half> %res } @@ -450,6 +963,14 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fmul <4 x half> %op1, %op2 ret <4 x half> %res } @@ -463,6 +984,18 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fmul v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fmul v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fmul <8 x half> %op1, %op2 ret <8 x half> %res } @@ -478,6 +1011,29 @@ define void @fmul_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fmul v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fmul v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fmul v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmul v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; 
NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fmul <16 x half> %op1, %op2 @@ -494,6 +1050,11 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = fmul <2 x float> %op1, %op2 ret <2 x float> %res } @@ -507,6 +1068,11 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fmul <4 x float> %op1, %op2 ret <4 x float> %res } @@ -522,6 +1088,15 @@ define void @fmul_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmul v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmul v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fmul <8 x float> %op1, %op2 @@ -538,6 +1113,11 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmul v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = fmul <2 x double> %op1, %op2 ret <2 x double> %res } @@ -553,6 +1133,15 @@ define void 
@fmul_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fmul z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmul v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmul v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = fmul <4 x double> %op1, %op2 @@ -572,6 +1161,12 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) { ; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = fneg <2 x half> %op ret <2 x half> %res } @@ -584,6 +1179,12 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) { ; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = fneg <4 x half> %op ret <4 x half> %res } @@ -596,6 +1197,12 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) { ; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = fneg <8 x half> %op ret <8 x half> %res } @@ -609,6 +1216,15 @@ define void @fneg_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fneg z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
movi v0.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = fneg <16 x half> %op store <16 x half> %res, ptr %a @@ -623,6 +1239,11 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) { ; CHECK-NEXT: fneg z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fneg <2 x float> %op ret <2 x float> %res } @@ -635,6 +1256,11 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) { ; CHECK-NEXT: fneg z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fneg <4 x float> %op ret <4 x float> %res } @@ -648,6 +1274,14 @@ define void @fneg_v8f32(ptr %a) { ; CHECK-NEXT: fneg z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fneg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = fneg <8 x float> %op store <8 x float> %res, ptr %a @@ -662,6 +1296,11 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) { ; CHECK-NEXT: fneg z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fneg <2 x double> %op ret <2 x double> %res } @@ -675,6 +1314,14 @@ define void @fneg_v4f64(ptr %a) { ; CHECK-NEXT: fneg z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, 
q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fneg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = fneg <4 x double> %op store <4 x double> %res, ptr %a @@ -693,6 +1340,30 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) { ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: mov h3, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsqrt s2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s1, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fsqrt s3, s3 +; NONEON-NOSVE-NEXT: fsqrt s4, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s2 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s3 +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s4 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -705,6 +1376,30 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) { ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: mov h3, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: 
fsqrt s2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s1, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fsqrt s3, s3 +; NONEON-NOSVE-NEXT: fsqrt s4, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s2 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s3 +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s4 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -717,6 +1412,48 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) { ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: mov h3, v0.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[3] +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: mov h7, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsqrt s2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h0 +; NONEON-NOSVE-NEXT: fcvt h0, s2 +; NONEON-NOSVE-NEXT: fsqrt s1, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s3, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s3 +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s4, s4 +; NONEON-NOSVE-NEXT: fcvt h1, s4 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s5, s5 +; NONEON-NOSVE-NEXT: fcvt h1, s5 +; NONEON-NOSVE-NEXT: mov v0.h[4], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s6, s6 +; NONEON-NOSVE-NEXT: fcvt h1, s6 +; 
NONEON-NOSVE-NEXT: mov v0.h[5], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s7, s7 +; NONEON-NOSVE-NEXT: fcvt h1, s7 +; NONEON-NOSVE-NEXT: mov v0.h[6], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s2, s16 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v0.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -730,6 +1467,89 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fsqrt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q16, [x0] +; NONEON-NOSVE-NEXT: mov h0, v1.h[1] +; NONEON-NOSVE-NEXT: mov h17, v16.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s18, h16 +; NONEON-NOSVE-NEXT: mov h19, v16.h[2] +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: mov h20, v16.h[3] +; NONEON-NOSVE-NEXT: mov h5, v1.h[4] +; NONEON-NOSVE-NEXT: mov h21, v16.h[4] +; NONEON-NOSVE-NEXT: mov h6, v1.h[5] +; NONEON-NOSVE-NEXT: mov h22, v16.h[5] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fsqrt s2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: mov h7, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s21, h21 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s22, h22 +; NONEON-NOSVE-NEXT: mov h23, v16.h[6] +; NONEON-NOSVE-NEXT: mov h16, v16.h[7] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s23, h23 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[1], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s17, s17 +; 
NONEON-NOSVE-NEXT: fcvt h17, s17 +; NONEON-NOSVE-NEXT: fsqrt s18, s18 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: mov v18.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: fsqrt s3, s3 +; NONEON-NOSVE-NEXT: fcvt h0, s3 +; NONEON-NOSVE-NEXT: mov v2.h[2], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s19, s19 +; NONEON-NOSVE-NEXT: fcvt h17, s19 +; NONEON-NOSVE-NEXT: mov v18.h[2], v17.h[0] +; NONEON-NOSVE-NEXT: fsqrt s4, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s4 +; NONEON-NOSVE-NEXT: mov v2.h[3], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s20, s20 +; NONEON-NOSVE-NEXT: fcvt h3, s20 +; NONEON-NOSVE-NEXT: mov v18.h[3], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s5, s5 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: mov v2.h[4], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s21, s21 +; NONEON-NOSVE-NEXT: fcvt h3, s21 +; NONEON-NOSVE-NEXT: mov v18.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s6, s6 +; NONEON-NOSVE-NEXT: fcvt h0, s6 +; NONEON-NOSVE-NEXT: mov v2.h[5], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s22, s22 +; NONEON-NOSVE-NEXT: fcvt h3, s22 +; NONEON-NOSVE-NEXT: mov v18.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s7, s7 +; NONEON-NOSVE-NEXT: fcvt h0, s7 +; NONEON-NOSVE-NEXT: mov v2.h[6], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s23, s23 +; NONEON-NOSVE-NEXT: fcvt h3, s23 +; NONEON-NOSVE-NEXT: mov v18.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s16, s16 +; NONEON-NOSVE-NEXT: fcvt h3, s16 +; NONEON-NOSVE-NEXT: mov v18.h[7], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q18, q2, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -744,6 +1564,11 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) { ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsqrt v0.2s, v0.2s +; 
NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -756,6 +1581,11 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) { ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsqrt v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -769,6 +1599,14 @@ define void @fsqrt_v8f32(ptr %a) { ; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fsqrt v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fsqrt v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -783,6 +1621,11 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) { ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsqrt v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -796,6 +1639,14 @@ define void @fsqrt_v4f64(ptr %a) { ; CHECK-NEXT: fsqrt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fsqrt v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fsqrt v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -815,6 +1666,14 @@ define <2 x half> @fsub_v2f16(<2 x half> 
%op1, <2 x half> %op2) { ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fsub <2 x half> %op1, %op2 ret <2 x half> %res } @@ -828,6 +1687,14 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fsub <4 x half> %op1, %op2 ret <4 x half> %res } @@ -841,6 +1708,18 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fsub v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fsub v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fsub <8 x half> %op1, %op2 ret <8 x half> %res } @@ -856,6 +1735,29 @@ define void @fsub_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fsub z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, 
v0.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fsub v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fsub v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fsub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fsub v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fsub <16 x half> %op1, %op2 @@ -872,6 +1774,11 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = fsub <2 x float> %op1, %op2 ret <2 x float> %res } @@ -885,6 +1792,11 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fsub <4 x float> %op1, %op2 ret <4 x float> %res } @@ -900,6 +1812,15 @@ define void @fsub_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fsub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fsub v1.4s, 
v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fsub <8 x float> %op1, %op2 @@ -916,6 +1837,11 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsub v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = fsub <2 x double> %op1, %op2 ret <2 x double> %res } @@ -931,6 +1857,15 @@ define void @fsub_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fsub z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fsub v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fsub v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = fsub <4 x double> %op1, %op2 @@ -950,6 +1885,11 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) { ; CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: bic v0.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.fabs.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -962,6 +1902,11 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) { ; CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: bic v0.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -974,6 +1919,11 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) { ; 
CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: bic v0.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -987,6 +1937,14 @@ define void @fabs_v16f16(ptr %a) { ; CHECK-NEXT: fabs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: bic v0.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: bic v1.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -1001,6 +1959,11 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) { ; CHECK-NEXT: fabs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fabs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -1013,6 +1976,11 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) { ; CHECK-NEXT: fabs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fabs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -1026,6 +1994,14 @@ define void @fabs_v8f32(ptr %a) { ; CHECK-NEXT: fabs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fabs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fabs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -1040,6 +2016,11 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) { ; CHECK-NEXT: fabs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fabs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -1053,6 +2034,14 @@ define void @fabs_v4f64(ptr %a) { ; CHECK-NEXT: fabs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fabs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fabs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll index cbd0ad66fba767..c5ed70c8a5f2f8 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,14 @@ define <2 x i16> @fcmp_oeq_v2f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: 
ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x half> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i16> ret <2 x i16> %sext @@ -34,6 +43,14 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <4 x half> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i16> ret <4 x i16> %sext @@ -49,6 +66,65 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: mov h4, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[4] +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s2, s5 +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; 
NONEON-NOSVE-NEXT: mov h5, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[5] +; NONEON-NOSVE-NEXT: mov h4, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[6] +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <8 x half> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i16> ret <8 x i16> %sext @@ -66,6 +142,123 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt 
s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, eq +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, eq +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, eq +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, eq +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] 
+; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, eq +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp oeq <16 x half> %op1, %op2 @@ -84,6 +277,11 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; 
CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcmeq v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x float> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i32> ret <2 x i32> %sext @@ -99,6 +297,11 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <4 x float> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %sext @@ -116,6 +319,15 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fcmeq v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %cmp = fcmp oeq <8 x float> %op1, %op2 @@ -132,6 +344,11 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcmeq d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <1 x double> %op1, %op2 %sext = sext <1 x i1> %cmp to <1 x i64> ret <1 x i64> %sext @@ -147,6 +364,11 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fcmp_oeq_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcmeq v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x double> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i64> ret <2 x i64> %sext @@ -164,6 +386,15 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fcmeq v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %cmp = fcmp oeq <4 x double> %op1, %op2 @@ -192,6 +423,139 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ueq_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h2 +; NONEON-NOSVE-NEXT: mov h5, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: mov h7, v1.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[4] +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: csinv w12, w9, wzr, vc +; 
NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: mov h5, v2.h[5] +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: csinv w10, w9, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: csinv w11, w9, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s6, h16 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: csinv w9, w9, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: mov h7, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w13, eq +; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s6, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: mov h6, v0.h[2] +; NONEON-NOSVE-NEXT: mov h7, v1.h[2] +; NONEON-NOSVE-NEXT: csetm w14, eq +; NONEON-NOSVE-NEXT: csinv w14, w14, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s4, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w15, eq +; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s5, s3 +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w16, eq +; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s4, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h3 +; NONEON-NOSVE-NEXT: fmov s2, w12 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w17, eq +; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, vc +; 
NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[4] +; NONEON-NOSVE-NEXT: fmov s3, w17 +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v3.h[1], w16 +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v0.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov v2.h[2], w10 +; NONEON-NOSVE-NEXT: mov v3.h[2], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: mov h7, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w11 +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov v3.h[3], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: mov v2.h[4], w9 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v3.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov v2.h[5], w13 +; NONEON-NOSVE-NEXT: mov v3.h[5], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], w14 +; NONEON-NOSVE-NEXT: mov v3.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: mov v2.h[7], w15 +; NONEON-NOSVE-NEXT: mov v3.h[7], w8 +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ueq <16 x half> %op1, %op2 @@ 
-220,6 +584,139 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_one_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h2 +; NONEON-NOSVE-NEXT: mov h5, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: mov h7, v1.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[4] +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: csinv w12, w9, wzr, le +; NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: mov h5, v2.h[5] +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: csinv w10, w9, wzr, le +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: csinv w11, w9, wzr, le +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s6, h16 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: csinv w9, 
w9, wzr, le +; NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: mov h7, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w13, mi +; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, le +; NONEON-NOSVE-NEXT: fcmp s6, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: mov h6, v0.h[2] +; NONEON-NOSVE-NEXT: mov h7, v1.h[2] +; NONEON-NOSVE-NEXT: csetm w14, mi +; NONEON-NOSVE-NEXT: csinv w14, w14, wzr, le +; NONEON-NOSVE-NEXT: fcmp s4, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w15, mi +; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, le +; NONEON-NOSVE-NEXT: fcmp s5, s3 +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w16, mi +; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, le +; NONEON-NOSVE-NEXT: fcmp s4, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h3 +; NONEON-NOSVE-NEXT: fmov s2, w12 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w17, mi +; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[4] +; NONEON-NOSVE-NEXT: fmov s3, w17 +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: mov v3.h[1], w16 +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v0.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov v2.h[2], w10 +; NONEON-NOSVE-NEXT: mov v3.h[2], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: mov h7, v1.h[6] +; NONEON-NOSVE-NEXT: mov 
v2.h[3], w11 +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov v3.h[3], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: mov v2.h[4], w9 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v3.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov v2.h[5], w13 +; NONEON-NOSVE-NEXT: mov v3.h[5], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], w14 +; NONEON-NOSVE-NEXT: mov v3.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: mov v2.h[7], w15 +; NONEON-NOSVE-NEXT: mov v3.h[7], w8 +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp one <16 x half> %op1, %op2 @@ -244,6 +741,123 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_une_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; 
NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ne +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ne +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ne +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ne +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ne +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ne +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; 
NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp une <16 x half> %op1, %op2 @@ -268,6 +882,123 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ogt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; 
NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, gt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, gt +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, gt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, gt +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, gt +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, gt +; 
NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, gt +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, gt +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x 
half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ogt <16 x half> %op1, %op2 @@ -295,6 +1026,123 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ugt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, hi +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, hi +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, hi +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, 
h6 +; NONEON-NOSVE-NEXT: csetm w13, hi +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, hi +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, hi +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, hi +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, hi +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ugt <16 x half> %op1, %op2 @@ -319,6 +1167,123 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_olt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, mi +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, mi +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr 
q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, mi +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, mi +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, mi +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, mi +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, mi +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, mi +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: 
fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp olt <16 x half> %op1, %op2 @@ -346,6 +1311,123 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ult_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: 
fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, lt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, lt +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, lt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, lt +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, lt +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, lt +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, lt +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, lt +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov 
v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ult <16 x half> %op1, %op2 @@ -370,6 +1452,123 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oge_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] 
+; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ge +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ge +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ge +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ge +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ge +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ge +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ge +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ge +; 
NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ge +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp oge <16 x half> %op1, %op2 @@ -397,6 +1596,123 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, 
[x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_uge_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, pl +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, pl +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, pl +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, pl +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, pl +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; 
NONEON-NOSVE-NEXT: csetm w14, pl +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, pl +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, pl +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, pl +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, pl +; 
NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp uge <16 x half> %op1, %op2 @@ -421,6 +1737,123 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ole_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ls +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ls +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ls +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, 
h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ls +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ls +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ls +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ls +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ls +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ls +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; 
NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ole <16 x half> %op1, %op2 @@ -448,6 +1881,123 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ule_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; 
NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, le +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, le +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, le +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, le +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, le +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, le +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; 
NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ule <16 x half> %op1, %op2 @@ -472,6 +2022,123 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_uno_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov 
h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, vs +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, vs +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, vs +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, vs +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, vs +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, vs +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, vs +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, vs +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, vs +; NONEON-NOSVE-NEXT: mov 
v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp uno <16 x half> %op1, %op2 @@ -499,6 +2166,123 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ord_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov 
h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, vc +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, vc +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, vc +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, vc +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, 
vc +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, vc +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, vc +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 
x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ord <16 x half> %op1, %op2 @@ -523,6 +2307,123 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_eq_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, eq +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; 
NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, eq +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, eq +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, eq +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, eq +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: 
fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast oeq <16 x half> %op1, %op2 @@ -547,6 +2448,123 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ne_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ne +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; 
NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ne +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ne +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ne +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ne +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ne +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; 
NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast one <16 x half> %op1, %op2 @@ -571,6 +2589,123 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_gt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, gt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, gt +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, gt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, gt +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, gt +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, gt +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, gt +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, gt +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; 
NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast ogt <16 x half> %op1, %op2 @@ -595,6 +2730,123 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_lt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; 
NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, lt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, lt +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, lt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, lt +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, lt +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, lt +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; 
NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, lt +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, lt +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast olt <16 x half> %op1, %op2 @@ -619,6 +2871,123 @@ define void @fcmp_ge_v16f16(ptr %a, 
ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ge_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ge +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ge +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ge +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ge +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ge +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; 
NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ge +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ge +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ge +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ge +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: 
mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast oge <16 x half> %op1, %op2 @@ -643,6 +3012,123 @@ define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_le_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, le +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; 
NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, le +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, le +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, le +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, le +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, le +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, 
v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast ole <16 x half> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll index 57d072a7bcd68b..055af194be211a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,17 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fp_convert_combine_crash: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov v0.4s, #8.00000000 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmul v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmul v0.4s, 
v2.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %f = load <8 x float>, ptr %a %mul.i = fmul <8 x float> %f, diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll index 6a2dc3c7182527..ce8902cfa16c3d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,12 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f16_to_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %res = fpext <2 x half> %a to <2 x float> store <2 x float> %res, ptr %b ret void @@ -31,6 +38,12 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f16_to_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %res = fpext <4 x half> %a to <4 x float> store <4 x float> %res, ptr %b ret void @@ -48,6 +61,17 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fcvt_v8f16_to_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = fpext <8 x half> %a to <8 x float> store <8 x float> %res, ptr %b ret void @@ -72,6 +96,21 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) { ; CHECK-NEXT: stp q3, q0, [x0] ; CHECK-NEXT: stp q2, q1, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: stp q0, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %res = fpext <16 x half> %a to <16 x float> store <16 x float> %res, ptr %b ret void @@ -90,6 +129,13 @@ define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %res = fpext <2 x half> %op1 to <2 x float> store <2 x float> %res, ptr %b @@ -104,6 +150,13 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvtl 
v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fpext <4 x half> %op1 to <4 x float> store <4 x float> %res, ptr %b @@ -121,6 +174,18 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z1.s, p0/m, z1.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fpext <8 x half> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -145,6 +210,22 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fpext <16 x half> %op1 to <16 x float> store <16 x float> %res, ptr %b @@ -162,6 +243,13 @@ define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt d0, h0 ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v1f16_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <1 x half>, ptr %a %res = fpext <1 x half> %op1 to <1 x double> store <1 x double> %res, ptr %b @@ -176,6 +264,14 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.d, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %res = fpext <2 x half> %op1 to <2 x double> store <2 x double> %res, ptr %b @@ -193,6 +289,19 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z1.d, p0/m, z1.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fpext <4 x half> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -217,6 +326,26 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: stp q0, q2, [x1] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fpext <8 x half> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -258,6 +387,38 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q4, q0, [x1, #32] ; CHECK-NEXT: stp q1, q2, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtl v5.2d, v5.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: fcvtl v4.2d, v4.2s +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v7.2s +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v6.2s +; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fpext <16 x half> %op1 to <16 x double> store <16 x double> %res, ptr %b @@ -275,6 +436,13 @@ define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt d0, s0 ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v1f32_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <1 x float>, ptr %a %res = fpext <1 x float> %op1 to <1 x double> store <1 x double> %res, ptr %b @@ -289,6 +457,13 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.d, p0/m, z0.s ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: str q0, 
[x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %res = fpext <2 x float> %op1 to <2 x double> store <2 x double> %res, ptr %b @@ -306,6 +481,18 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z1.d, p0/m, z1.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fpext <4 x float> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -330,6 +517,22 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fpext <8 x float> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -348,6 +551,13 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %res = fptrunc <2 x float> %op1 to <2 x half> store <2 x half> %res, ptr %b @@ -362,6 +572,13 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptrunc <4 x float> %op1 to <4 x half> store <4 x half> %res, ptr %b @@ -379,6 +596,14 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) { ; CHECK-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1] ; CHECK-NEXT: st1h { z1.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptrunc <8 x float> %op1 to <8 x 
half> store <8 x half> %res, ptr %b @@ -397,6 +622,13 @@ define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: st1h { z0.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <1 x double>, ptr %a %res = fptrunc <1 x double> %op1 to <1 x half> store <1 x half> %res, ptr %b @@ -411,6 +643,16 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: st1h { z0.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: fcvt h1, d1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %res = fptrunc <2 x double> %op1 to <2 x half> store <2 x half> %res, ptr %b @@ -428,6 +670,21 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) { ; CHECK-NEXT: st1h { z0.d }, p0, [x1, x8, lsl #1] ; CHECK-NEXT: st1h { z1.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: fcvt h1, d1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: mov d2, v2.d[1] +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptrunc <4 x double> %op1 to <4 x half> store <4 x half> %res, ptr %b @@ -446,6 +703,13 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) { ; CHECK-NEXT: fcvt z0.s, 
p0/m, z0.d ; CHECK-NEXT: st1w { z0.d }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ret %res = fptrunc <1 x double> %op1 to <1 x float> store <1 x float> %res, ptr %b ret void @@ -459,6 +723,12 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NEXT: st1w { z0.d }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %res = fptrunc <2 x double> %op1 to <2 x float> store <2 x float> %res, ptr %b ret void @@ -475,6 +745,14 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) { ; CHECK-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2] ; CHECK-NEXT: st1w { z1.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptrunc <4 x double> %op1 to <4 x float> store <4 x float> %res, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll index 153a04f4865715..9d2b55903f3141 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" 
@@ -17,6 +18,18 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <4 x half> %op1, %op2 %res = fadd contract <4 x half> %mul, %op3 ret <4 x half> %res @@ -32,6 +45,26 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fmul v3.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v3.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <8 x half> %op1, %op2 %res = fadd contract <8 x half> %mul, %op3 ret <8 x half> %res @@ -49,6 +82,46 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.h, p0/m, z3.h, z4.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: 
ret +; +; NONEON-NOSVE-LABEL: fma_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fmul v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fmul v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fmul v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: ldp q0, q2, [x2] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %op3 = load <16 x half>, ptr %c @@ -68,6 +141,12 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f32: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: fmla v2.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <2 x float> %op1, %op2 %res = fadd contract <2 x float> %mul, %op3 ret <2 x float> %res @@ -83,6 +162,12 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <4 x float> %op1, %op2 %res = fadd contract <4 x float> %mul, %op3 ret <4 x float> %res @@ -100,6 +185,16 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.s, p0/m, z3.s, z4.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fmla v1.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: fmla v5.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %op3 = load <8 x float>, ptr %c @@ -114,6 +209,11 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double ; CHECK: // %bb.0: ; CHECK-NEXT: fmadd d0, d0, d1, d2 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmadd d0, d0, d1, d2 +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <1 x double> %op1, %op2 %res = fadd contract <1 x double> %mul, %op3 ret <1 x double> %res @@ -129,6 +229,12 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f64: +; NONEON-NOSVE: 
// %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <2 x double> %op1, %op2 %res = fadd contract <2 x double> %mul, %op3 ret <2 x double> %res @@ -146,6 +252,16 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.d, p0/m, z3.d, z4.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fmla v1.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: fmla v5.2d, v4.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %op3 = load <4 x double>, ptr %c diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll index 6945a6102c0553..a96adfec2ad105 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,38 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; 
NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fmaxnm s5, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: fmaxnm s3, s4, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: fmaxnm s1, s4, s1 +; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res } @@ -30,6 +63,64 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fmaxnm s3, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s4 +; NONEON-NOSVE-NEXT: fmaxnm s4, s7, s6 +; 
NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fmaxnm s5, s5, s16 +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt s3, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fmaxnm s3, s6, s3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fmaxnm s6, s16, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res } @@ -45,6 +136,119 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmaxnm z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov h7, v0.h[1] +; NONEON-NOSVE-NEXT: mov h16, v0.h[2] +; NONEON-NOSVE-NEXT: mov h18, v2.h[1] +; NONEON-NOSVE-NEXT: mov h5, v1.h[1] +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; 
NONEON-NOSVE-NEXT: mov h17, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: fcvt s20, h3 +; NONEON-NOSVE-NEXT: fcvt s21, h2 +; NONEON-NOSVE-NEXT: mov h22, v3.h[2] +; NONEON-NOSVE-NEXT: mov h23, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fmaxnm s4, s19, s4 +; NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: mov h24, v3.h[3] +; NONEON-NOSVE-NEXT: fmaxnm s20, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov h23, v2.h[3] +; NONEON-NOSVE-NEXT: mov h25, v2.h[6] +; NONEON-NOSVE-NEXT: fmaxnm s5, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[3] +; NONEON-NOSVE-NEXT: fmaxnm s6, s16, s6 +; NONEON-NOSVE-NEXT: fmaxnm s16, s18, s17 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s18, h19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: mov h24, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h17, s5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt h5, s20 +; NONEON-NOSVE-NEXT: fmaxnm s20, s22, s21 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s21, h23 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: mov h22, v0.h[4] +; NONEON-NOSVE-NEXT: mov h23, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: mov h17, v1.h[4] +; NONEON-NOSVE-NEXT: fmaxnm s7, s18, s7 +; NONEON-NOSVE-NEXT: mov h18, v3.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s20 +; NONEON-NOSVE-NEXT: fmaxnm s19, s21, s19 +; NONEON-NOSVE-NEXT: fcvt s20, h23 +; NONEON-NOSVE-NEXT: mov h21, v1.h[5] +; NONEON-NOSVE-NEXT: mov h23, v2.h[5] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: fcvt s6, h17 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: fcvt h7, s7 
+; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov h22, v3.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s19 +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s6, s17, s6 +; NONEON-NOSVE-NEXT: mov h17, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fmaxnm s18, s20, s18 +; NONEON-NOSVE-NEXT: mov h20, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt s7, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt s16, h21 +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: fmaxnm s7, s22, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s16, s21, s16 +; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] +; NONEON-NOSVE-NEXT: fmaxnm s6, s19, s17 +; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fmaxnm s17, s23, s20 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s17 +; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = call <16 x 
half> @llvm.maxnum.v16f16(<16 x half> %op1, <16 x half> %op2) @@ -61,6 +265,11 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnm v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res } @@ -74,6 +283,11 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res } @@ -89,6 +303,15 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fmaxnm z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmaxnm v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %op1, <8 x float> %op2) @@ -101,6 +324,11 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmaxnm d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnm d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res } @@ -114,6 +342,11 @@ 
define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res } @@ -129,6 +362,15 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fmaxnm z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmaxnm v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %op1, <4 x double> %op2) @@ -149,6 +391,38 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fminnm s5, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; 
NONEON-NOSVE-NEXT: fminnm s3, s4, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: fminnm s1, s4, s1 +; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res } @@ -162,6 +436,64 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fminnm s3, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s4 +; NONEON-NOSVE-NEXT: fminnm s4, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fminnm s5, s5, s16 +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt s3, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] +; NONEON-NOSVE-NEXT: 
mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fminnm s3, s6, s3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fminnm s6, s16, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res } @@ -177,6 +509,119 @@ define void @fminnm_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov h7, v0.h[1] +; NONEON-NOSVE-NEXT: mov h16, v0.h[2] +; NONEON-NOSVE-NEXT: mov h18, v2.h[1] +; NONEON-NOSVE-NEXT: mov h5, v1.h[1] +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h17, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: fcvt s20, h3 +; NONEON-NOSVE-NEXT: fcvt s21, h2 +; NONEON-NOSVE-NEXT: mov h22, v3.h[2] +; NONEON-NOSVE-NEXT: mov h23, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fminnm s4, s19, s4 +; 
NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: mov h24, v3.h[3] +; NONEON-NOSVE-NEXT: fminnm s20, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov h23, v2.h[3] +; NONEON-NOSVE-NEXT: mov h25, v2.h[6] +; NONEON-NOSVE-NEXT: fminnm s5, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[3] +; NONEON-NOSVE-NEXT: fminnm s6, s16, s6 +; NONEON-NOSVE-NEXT: fminnm s16, s18, s17 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s18, h19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: mov h24, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h17, s5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt h5, s20 +; NONEON-NOSVE-NEXT: fminnm s20, s22, s21 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s21, h23 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: mov h22, v0.h[4] +; NONEON-NOSVE-NEXT: mov h23, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: mov h17, v1.h[4] +; NONEON-NOSVE-NEXT: fminnm s7, s18, s7 +; NONEON-NOSVE-NEXT: mov h18, v3.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s20 +; NONEON-NOSVE-NEXT: fminnm s19, s21, s19 +; NONEON-NOSVE-NEXT: fcvt s20, h23 +; NONEON-NOSVE-NEXT: mov h21, v1.h[5] +; NONEON-NOSVE-NEXT: mov h23, v2.h[5] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: fcvt s6, h17 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov h22, v3.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s19 +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s6, s17, s6 +; NONEON-NOSVE-NEXT: mov h17, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fminnm s18, s20, s18 +; NONEON-NOSVE-NEXT: mov h20, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[3], 
v7.h[0] +; NONEON-NOSVE-NEXT: fcvt s7, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt s16, h21 +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: fminnm s7, s22, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s16, s21, s16 +; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] +; NONEON-NOSVE-NEXT: fminnm s6, s19, s17 +; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fminnm s17, s23, s20 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s17 +; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = call <16 x half> @llvm.minnum.v16f16(<16 x half> %op1, <16 x half> %op2) @@ -193,6 +638,11 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnm v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x 
float> %res } @@ -206,6 +656,11 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnm v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res } @@ -221,6 +676,15 @@ define void @fminnm_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fminnm v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fminnm v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = call <8 x float> @llvm.minnum.v8f32(<8 x float> %op1, <8 x float> %op2) @@ -233,6 +697,11 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: fminnm d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnm d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res } @@ -246,6 +715,11 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnm v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res } @@ -261,6 +735,15 @@ define void @fminnm_v4f64(ptr %a, ptr %b) { 
; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fminnm v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fminnm v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = call <4 x double> @llvm.minnum.v4f64(<4 x double> %op1, <4 x double> %op2) @@ -281,6 +764,38 @@ define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fmax s5, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: fmax s3, s4, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: fmax s1, s4, s1 +; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x 
half> %op2) ret <4 x half> %res } @@ -294,6 +809,64 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fmax s3, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s4 +; NONEON-NOSVE-NEXT: fmax s4, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fmax s5, s5, s16 +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt s3, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fmax s3, s6, s3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fmax s6, s16, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; 
NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res } @@ -309,6 +882,119 @@ define void @fmax_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmax z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov h7, v0.h[1] +; NONEON-NOSVE-NEXT: mov h16, v0.h[2] +; NONEON-NOSVE-NEXT: mov h18, v2.h[1] +; NONEON-NOSVE-NEXT: mov h5, v1.h[1] +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h17, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: fcvt s20, h3 +; NONEON-NOSVE-NEXT: fcvt s21, h2 +; NONEON-NOSVE-NEXT: mov h22, v3.h[2] +; NONEON-NOSVE-NEXT: mov h23, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fmax s4, s19, s4 +; NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: mov h24, v3.h[3] +; NONEON-NOSVE-NEXT: fmax s20, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov h23, v2.h[3] +; NONEON-NOSVE-NEXT: mov h25, v2.h[6] +; NONEON-NOSVE-NEXT: fmax s5, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[3] +; NONEON-NOSVE-NEXT: fmax s6, s16, s6 +; NONEON-NOSVE-NEXT: fmax s16, s18, s17 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s18, h19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: mov h24, v0.h[5] 
+; NONEON-NOSVE-NEXT: fcvt h17, s5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt h5, s20 +; NONEON-NOSVE-NEXT: fmax s20, s22, s21 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s21, h23 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: mov h22, v0.h[4] +; NONEON-NOSVE-NEXT: mov h23, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: mov h17, v1.h[4] +; NONEON-NOSVE-NEXT: fmax s7, s18, s7 +; NONEON-NOSVE-NEXT: mov h18, v3.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s20 +; NONEON-NOSVE-NEXT: fmax s19, s21, s19 +; NONEON-NOSVE-NEXT: fcvt s20, h23 +; NONEON-NOSVE-NEXT: mov h21, v1.h[5] +; NONEON-NOSVE-NEXT: mov h23, v2.h[5] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: fcvt s6, h17 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov h22, v3.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s19 +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s6, s17, s6 +; NONEON-NOSVE-NEXT: mov h17, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fmax s18, s20, s18 +; NONEON-NOSVE-NEXT: mov h20, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt s7, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt s16, h21 +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: fmax s7, s22, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s16, s21, 
s16 +; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] +; NONEON-NOSVE-NEXT: fmax s6, s19, s17 +; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fmax s17, s23, s20 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s17 +; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = call <16 x half> @llvm.maximum.v16f16(<16 x half> %op1, <16 x half> %op2) @@ -325,6 +1011,11 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res } @@ -338,6 +1029,11 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res } @@ -353,6 +1049,15 @@ define void @fmax_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fmax z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: 
ret +; +; NONEON-NOSVE-LABEL: fmax_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = call <8 x float> @llvm.maximum.v8f32(<8 x float> %op1, <8 x float> %op2) @@ -365,6 +1070,11 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmax d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmax d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res } @@ -378,6 +1088,11 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmax v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res } @@ -393,6 +1108,15 @@ define void @fmax_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fmax z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmax v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmax v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = call <4 x double> @llvm.maximum.v4f64(<4 x double> %op1, <4 x double> %op2) @@ -413,6 +1137,38 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) { ; 
CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fmin s5, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: fmin s3, s4, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: fmin s1, s4, s1 +; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res } @@ -426,6 +1182,64 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; 
NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fmin s3, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s4 +; NONEON-NOSVE-NEXT: fmin s4, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fmin s5, s5, s16 +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt s3, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fmin s3, s6, s3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fmin s6, s16, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res } @@ -441,6 +1255,119 @@ define void @fmin_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmin z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v16f16: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov h7, v0.h[1] +; NONEON-NOSVE-NEXT: mov h16, v0.h[2] +; NONEON-NOSVE-NEXT: mov h18, v2.h[1] +; NONEON-NOSVE-NEXT: mov h5, v1.h[1] +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h17, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: fcvt s20, h3 +; NONEON-NOSVE-NEXT: fcvt s21, h2 +; NONEON-NOSVE-NEXT: mov h22, v3.h[2] +; NONEON-NOSVE-NEXT: mov h23, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fmin s4, s19, s4 +; NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: mov h24, v3.h[3] +; NONEON-NOSVE-NEXT: fmin s20, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov h23, v2.h[3] +; NONEON-NOSVE-NEXT: mov h25, v2.h[6] +; NONEON-NOSVE-NEXT: fmin s5, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[3] +; NONEON-NOSVE-NEXT: fmin s6, s16, s6 +; NONEON-NOSVE-NEXT: fmin s16, s18, s17 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s18, h19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: mov h24, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h17, s5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt h5, s20 +; NONEON-NOSVE-NEXT: fmin s20, s22, s21 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s21, h23 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: mov h22, v0.h[4] +; NONEON-NOSVE-NEXT: mov h23, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: mov h17, v1.h[4] +; NONEON-NOSVE-NEXT: fmin s7, s18, s7 +; NONEON-NOSVE-NEXT: mov h18, v3.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s20 +; NONEON-NOSVE-NEXT: fmin s19, s21, s19 +; 
NONEON-NOSVE-NEXT: fcvt s20, h23 +; NONEON-NOSVE-NEXT: mov h21, v1.h[5] +; NONEON-NOSVE-NEXT: mov h23, v2.h[5] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: fcvt s6, h17 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov h22, v3.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s19 +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s6, s17, s6 +; NONEON-NOSVE-NEXT: mov h17, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fmin s18, s20, s18 +; NONEON-NOSVE-NEXT: mov h20, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt s7, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt s16, h21 +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: fmin s7, s22, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s16, s21, s16 +; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] +; NONEON-NOSVE-NEXT: fmin s6, s19, s17 +; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fmin s17, s23, s20 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s17 +; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v5.h[6], 
v3.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = call <16 x half> @llvm.minimum.v16f16(<16 x half> %op1, <16 x half> %op2) @@ -457,6 +1384,11 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res } @@ -470,6 +1402,11 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res } @@ -485,6 +1422,15 @@ define void @fmin_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fmin z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = call <8 x float> @llvm.minimum.v8f32(<8 x float> %op1, <8 x float> %op2) @@ -497,6 +1443,11 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmin d0, d0, d1 ; CHECK-NEXT: ret 
+; +; NONEON-NOSVE-LABEL: fmin_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmin d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res } @@ -510,6 +1461,11 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmin v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res } @@ -525,6 +1481,15 @@ define void @fmin_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fmin z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmin v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmin v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = call <4 x double> @llvm.minimum.v4f64(<4 x double> %op1, <4 x double> %op2) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll index e239ff5e35fd36..f1561011e21812 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64 +; RUN: llc -force-streaming-compatible < %s | 
FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -26,6 +27,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) { ; NO-FA64-NEXT: fadd h0, h0, h2 ; NO-FA64-NEXT: fadd h0, h0, h1 ; NO-FA64-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll index 78ae7bb6cf30ab..a0a7dad835662e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) { ; CHECK-NEXT: fadd h0, h0, h2 ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fadda_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res } @@ -43,6 +68,49 @@ define half @fadda_v8f16(half %start, <8 x half> %a) { ; CHECK-NEXT: fadd h0, h0, h2 ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 
+; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a) ret half %res } @@ -83,6 +151,90 @@ define half @fadda_v16f16(half %start, ptr %a) { ; CHECK-NEXT: fadd h0, h0, h2 ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, 
h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op) ret half %res @@ -96,6 +248,14 @@ define float @fadda_v2f32(float %start, <2 x float> %a) { ; CHECK-NEXT: mov z1.s, z1.s[1] ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fadda_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a) ret float %res } @@ -112,6 +272,17 @@ define float @fadda_v4f32(float %start, <4 x float> %a) { ; CHECK-NEXT: fadd s0, s0, s2 ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: mov s3, v1.s[2] +; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s3 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a) ret float %res } @@ -136,6 +307,26 @@ define float @fadda_v8f32(float %start, ptr %a) { ; CHECK-NEXT: fadd s0, s0, s2 ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: mov s3, v1.s[2] +; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s3 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: mov s3, v1.s[2] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s3 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op) ret float %res @@ -146,6 +337,11 @@ define double @fadda_v1f64(double 
%start, <1 x double> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a) ret double %res } @@ -158,6 +354,13 @@ define double @fadda_v2f64(double %start, <2 x double> %a) { ; CHECK-NEXT: mov z1.d, z1.d[1] ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov d2, v1.d[1] +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a) ret double %res } @@ -174,6 +377,17 @@ define double @fadda_v4f64(double %start, ptr %a) { ; CHECK-NEXT: mov z1.d, z1.d[1] ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] +; NONEON-NOSVE-NEXT: mov d2, v3.d[1] +; NONEON-NOSVE-NEXT: fadd d0, d0, d3 +; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: mov d2, v1.d[1] +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op) ret double %res @@ -191,6 +405,30 @@ define half @faddv_v4f16(half %start, <4 x half> %a) { ; CHECK-NEXT: faddv h1, p0, z1.h ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt 
h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res } @@ -203,6 +441,49 @@ define half @faddv_v8f16(half %start, <8 x half> %a) { ; CHECK-NEXT: faddv h1, p0, z1.h ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; 
NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a) ret half %res } @@ -216,6 +497,58 @@ define half @faddv_v16f16(half %start, ptr %a) { ; CHECK-NEXT: faddv h1, p0, z1.h ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fadd v3.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s +; NONEON-NOSVE-NEXT: mov h1, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s3, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s3, s1 +; NONEON-NOSVE-NEXT: mov h3, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: mov h3, v2.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: mov h3, v2.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: mov h3, v2.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 
+; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op) ret half %res @@ -229,6 +562,12 @@ define float @faddv_v2f32(float %start, <2 x float> %a) { ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a) ret float %res } @@ -241,6 +580,13 @@ define float @faddv_v4f32(float %start, <4 x float> %a) { ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: faddp v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a) ret float %res } @@ -254,6 +600,15 @@ define float @faddv_v8f32(float %start, ptr %a) { ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: faddp v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op) ret float %res @@ -264,6 +619,11 @@ define double @faddv_v1f64(double %start, <1 x double> %a) { ; CHECK: // %bb.0: ; 
CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a) ret double %res } @@ -276,6 +636,12 @@ define double @faddv_v2f64(double %start, <2 x double> %a) { ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: faddp d1, v1.2d +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a) ret double %res } @@ -289,6 +655,14 @@ define double @faddv_v4f64(double %start, ptr %a) { ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v1.2d +; NONEON-NOSVE-NEXT: faddp d1, v1.2d +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op) ret double %res @@ -306,6 +680,26 @@ define half @fmaxv_v4f16(<4 x half> %a) { ; CHECK-NEXT: fmaxnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; 
NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a) ret half %res } @@ -318,6 +712,45 @@ define half @fmaxv_v8f16(<8 x half> %a) { ; CHECK-NEXT: fmaxnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a) ret half %res } @@ -331,6 +764,85 @@ define half @fmaxv_v16f16(ptr %a) { ; CHECK-NEXT: fmaxnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fmaxv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fmaxnm s2, s4, s2 +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; 
NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op) ret half %res @@ -344,6 +856,11 @@ define float @fmaxv_v2f32(<2 x float> %a) { ; CHECK-NEXT: fmaxnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnmp s0, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a) ret float %res } @@ -356,6 +873,11 @@ define float @fmaxv_v4f32(<4 x float> %a) { ; CHECK-NEXT: fmaxnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnmv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %res } @@ -369,6 +891,13 @@ define float @fmaxv_v8f32(ptr %a) { ; CHECK-NEXT: fmaxnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmaxnmv 
s0, v0.4s +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op) ret float %res @@ -378,6 +907,10 @@ define double @fmaxv_v1f64(<1 x double> %a) { ; CHECK-LABEL: fmaxv_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) ret double %res } @@ -390,6 +923,11 @@ define double @fmaxv_v2f64(<2 x double> %a) { ; CHECK-NEXT: fmaxnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnmp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %res } @@ -403,6 +941,13 @@ define double @fmaxv_v4f64(ptr %a) { ; CHECK-NEXT: fmaxnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmaxnmp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op) ret double %res @@ -420,6 +965,26 @@ define half @fminv_v4f16(<4 x half> %a) { ; CHECK-NEXT: fminnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %res } @@ -432,6 +997,45 @@ define half @fminv_v8f16(<8 x half> %a) { ; CHECK-NEXT: fminnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a) ret half %res } @@ -445,6 +1049,85 @@ define half @fminv_v16f16(ptr %a) { ; CHECK-NEXT: fminnmv h0, p0, z0.h ; CHECK-NEXT: // 
kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fminnm s2, s4, s2 +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fminnm s2, s2, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, 
s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fminnm s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op) ret half %res @@ -458,6 +1141,11 @@ define float @fminv_v2f32(<2 x float> %a) { ; CHECK-NEXT: fminnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnmp s0, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) ret float %res } @@ -470,6 +1158,11 @@ define float @fminv_v4f32(<4 x float> %a) { ; CHECK-NEXT: fminnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnmv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ret float %res } @@ -483,6 +1176,13 @@ define float @fminv_v8f32(ptr %a) { ; CHECK-NEXT: fminnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, 
[x0] +; NONEON-NOSVE-NEXT: fminnm v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fminnmv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op) ret float %res @@ -492,6 +1192,10 @@ define double @fminv_v1f64(<1 x double> %a) { ; CHECK-LABEL: fminv_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a) ret double %res } @@ -504,6 +1208,11 @@ define double @fminv_v2f64(<2 x double> %a) { ; CHECK-NEXT: fminnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnmp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %res } @@ -517,6 +1226,13 @@ define double @fminv_v4f64(ptr %a) { ; CHECK-NEXT: fminnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fminnm v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fminnmp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op) ret double %res @@ -534,6 +1250,26 @@ define half @fmaximumv_v4f16(<4 x half> %a) { ; CHECK-NEXT: fmaxv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; 
NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a) ret half %res } @@ -546,6 +1282,45 @@ define half @fmaximumv_v8f16(<8 x half> %a) { ; CHECK-NEXT: fmaxv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a) ret half %res } @@ -559,6 
+1334,85 @@ define half @fmaximumv_v16f16(ptr %a) { ; CHECK-NEXT: fmaxv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fmax s2, s4, s2 +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fmax s2, s2, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[6] +; NONEON-NOSVE-NEXT: 
mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fmax s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op) ret half %res @@ -572,6 +1426,11 @@ define float @fmaximumv_v2f32(<2 x float> %a) { ; CHECK-NEXT: fmaxv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxp s0, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a) ret float %res } @@ -584,6 +1443,11 @@ define float @fmaximumv_v4f32(<4 x float> %a) { ; CHECK-NEXT: fmaxv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a) ret float %res } @@ -597,6 +1461,13 @@ define float @fmaximumv_v8f32(ptr %a) { ; CHECK-NEXT: fmaxv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fmaximumv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmaxv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op) ret float %res @@ -606,6 +1477,10 @@ define double @fmaximumv_v1f64(<1 x double> %a) { ; CHECK-LABEL: fmaximumv_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a) ret double %res } @@ -618,6 +1493,11 @@ define double @fmaximumv_v2f64(<2 x double> %a) { ; CHECK-NEXT: fmaxv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a) ret double %res } @@ -631,6 +1511,13 @@ define double @fmaximumv_v4f64(ptr %a) { ; CHECK-NEXT: fmaxv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmax v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmaxp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op) ret double %res @@ -648,6 +1535,26 @@ define half @fminimumv_v4f16(<4 x half> %a) { ; CHECK-NEXT: fminv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, 
s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a) ret half %res } @@ -660,6 +1567,45 @@ define half @fminimumv_v8f16(<8 x half> %a) { ; CHECK-NEXT: fminv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; 
NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a) ret half %res } @@ -673,6 +1619,85 @@ define half @fminimumv_v16f16(ptr %a) { ; CHECK-NEXT: fminv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fmin s2, s4, s2 +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fmin s2, s2, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; 
NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fmin s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op) ret half %res @@ -686,6 +1711,11 @@ define float @fminimumv_v2f32(<2 x float> %a) { ; CHECK-NEXT: fminv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminp s0, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a) ret float %res } @@ -698,6 +1728,11 @@ define float @fminimumv_v4f32(<4 x float> %a) { ; CHECK-NEXT: fminv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a) ret float %res } @@ -711,6 +1746,13 @@ define float 
@fminimumv_v8f32(ptr %a) { ; CHECK-NEXT: fminv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fminv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op) ret float %res @@ -720,6 +1762,10 @@ define double @fminimumv_v1f64(<1 x double> %a) { ; CHECK-LABEL: fminimumv_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fminimum.v1f64(<1 x double> %a) ret double %res } @@ -732,6 +1778,11 @@ define double @fminimumv_v2f64(<2 x double> %a) { ; CHECK-NEXT: fminv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a) ret double %res } @@ -745,6 +1796,13 @@ define double @fminimumv_v4f64(ptr %a) { ; CHECK-NEXT: fminv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmin v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fminp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op) ret double %res diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll index 412c27cb82f1d4..6af2b885ace08f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,13 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.ceil.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -28,6 +36,13 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -40,6 +55,16 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintp v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -53,6 
+78,24 @@ define void @frintp_v16f16(ptr %a) { ; CHECK-NEXT: frintp z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintp v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frintp v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -67,6 +110,11 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintp z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintp v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -79,6 +127,11 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintp z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -92,6 +145,14 @@ define void @frintp_v8f32(ptr %a) { ; CHECK-NEXT: frintp z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintp 
v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -103,6 +164,11 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintp d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -115,6 +181,11 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) { ; CHECK-NEXT: frintp z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintp v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -128,6 +199,14 @@ define void @frintp_v4f64(ptr %a) { ; CHECK-NEXT: frintp z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintp v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frintp v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -146,6 +225,13 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.floor.v2f16(<2 x half> %op) ret <2 x half> %res } @@ 
-158,6 +244,13 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -170,6 +263,16 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintm v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -183,6 +286,24 @@ define void @frintm_v16f16(ptr %a) { ; CHECK-NEXT: frintm z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintm v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frintm v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op) store <16 x 
half> %res, ptr %a @@ -197,6 +318,11 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintm z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintm v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -209,6 +335,11 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintm z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -222,6 +353,14 @@ define void @frintm_v8f32(ptr %a) { ; CHECK-NEXT: frintm z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -233,6 +372,11 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintm d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -245,6 +389,11 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) { ; CHECK-NEXT: frintm z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintm v0.2d, v0.2d +; 
NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -258,6 +407,14 @@ define void @frintm_v4f64(ptr %a) { ; CHECK-NEXT: frintm z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintm v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frintm v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -276,6 +433,13 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) { ; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.nearbyint.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -288,6 +452,13 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) { ; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -300,6 +471,16 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) { ; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, 
v1.4s +; NONEON-NOSVE-NEXT: frinti v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -313,6 +494,24 @@ define void @frinti_v16f16(ptr %a) { ; CHECK-NEXT: frinti z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frinti v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frinti v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -327,6 +526,11 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) { ; CHECK-NEXT: frinti z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinti v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -339,6 +543,11 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) { ; CHECK-NEXT: frinti z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -352,6 +561,14 @@ define void @frinti_v8f32(ptr 
%a) { ; CHECK-NEXT: frinti z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -363,6 +580,11 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frinti d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -375,6 +597,11 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) { ; CHECK-NEXT: frinti z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinti v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -388,6 +615,14 @@ define void @frinti_v4f64(ptr %a) { ; CHECK-NEXT: frinti z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frinti v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frinti v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -406,6 +641,13 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v2f16: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.rint.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -418,6 +660,13 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -430,6 +679,16 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintx v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -443,6 +702,24 @@ define void @frintx_v16f16(ptr %a) { ; CHECK-NEXT: frintx z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintx v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frintx v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; 
NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -457,6 +734,11 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintx v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -469,6 +751,11 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -482,6 +769,14 @@ define void @frintx_v8f32(ptr %a) { ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -493,6 +788,11 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintx d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -505,6 +805,11 @@ define <2 x double> @frintx_v2f64(<2 x 
double> %op) { ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintx v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -518,6 +823,14 @@ define void @frintx_v4f64(ptr %a) { ; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintx v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frintx v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -536,6 +849,13 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) { ; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.round.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -548,6 +868,13 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) { ; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -560,6 +887,16 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) { ; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret 
+; +; NONEON-NOSVE-LABEL: frinta_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frinta v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -573,6 +910,24 @@ define void @frinta_v16f16(ptr %a) { ; CHECK-NEXT: frinta z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frinta v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frinta v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -587,6 +942,11 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) { ; CHECK-NEXT: frinta z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinta v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -599,6 +959,11 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) { ; CHECK-NEXT: frinta z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v4f32: +; NONEON-NOSVE: 
// %bb.0: +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -612,6 +977,14 @@ define void @frinta_v8f32(ptr %a) { ; CHECK-NEXT: frinta z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -623,6 +996,11 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frinta d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -635,6 +1013,11 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) { ; CHECK-NEXT: frinta z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinta v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -648,6 +1031,14 @@ define void @frinta_v4f64(ptr %a) { ; CHECK-NEXT: frinta z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frinta v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frinta v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -666,6 +1057,13 @@ define <2 x 
half> @frintn_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -678,6 +1076,13 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -690,6 +1095,16 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintn v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -703,6 +1118,24 @@ define void @frintn_v16f16(ptr %a) { ; CHECK-NEXT: frintn z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintn v2.4s, v2.4s +; 
NONEON-NOSVE-NEXT: frintn v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -717,6 +1150,11 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintn z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintn v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -729,6 +1167,11 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintn z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -742,6 +1185,14 @@ define void @frintn_v8f32(ptr %a) { ; CHECK-NEXT: frintn z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -753,6 +1204,11 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintn d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v1f64: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -765,6 +1221,11 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) { ; CHECK-NEXT: frintn z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintn v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -778,6 +1239,14 @@ define void @frintn_v4f64(ptr %a) { ; CHECK-NEXT: frintn z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintn v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frintn v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -796,6 +1265,13 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.trunc.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -808,6 +1284,13 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call 
<4 x half> @llvm.trunc.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -820,6 +1303,16 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintz v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -833,6 +1326,24 @@ define void @frintz_v16f16(ptr %a) { ; CHECK-NEXT: frintz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintz v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frintz v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -847,6 +1358,11 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op) ret <2 x 
float> %res } @@ -859,6 +1375,11 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -872,6 +1393,14 @@ define void @frintz_v8f32(ptr %a) { ; CHECK-NEXT: frintz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -883,6 +1412,11 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintz d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -895,6 +1429,11 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) { ; CHECK-NEXT: frintz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintz v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -908,6 +1447,14 @@ define void @frintz_v4f64(ptr %a) { ; CHECK-NEXT: frintz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintz v0.2d, v0.2d +; 
NONEON-NOSVE-NEXT: frintz v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll index 89697cde848b53..824419b31a5a83 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,14 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2 ret <2 x half> %sel } @@ -32,6 +41,14 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2 ret <4 
x half> %sel } @@ -48,6 +65,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.8h, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2 ret <8 x half> %sel } @@ -67,6 +92,20 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <16 x half>, ptr %a %op2 = load volatile <16 x half>, ptr %b %sel = select i1 %mask, <16 x half> %op1, <16 x half> %op2 @@ -86,6 +125,14 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2 ret <2 x float> %sel } @@ -102,6 +149,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) { ; CHECK-NEXT: sel 
z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4s, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2 ret <4 x float> %sel } @@ -121,6 +176,20 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b %sel = select i1 %mask, <8 x float> %op1, <8 x float> %op2 @@ -134,6 +203,14 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: fcsel d0, d0, d1, ne ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: fmov d2, x8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel } @@ -151,6 +228,14 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f64: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: dup v2.2d, x8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2 ret <2 x double> %sel } @@ -171,6 +256,20 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b %sel = select i1 %mask, <4 x double> %op1, <4 x double> %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index 5840ffb20994ce..c853bdc5af8db0 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -15,6 +16,13 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) { ; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i16> ret <4 x i16> %res } @@ -27,6 +35,21 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i16> store <8 x i16> %res, ptr %b @@ -42,6 +65,27 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -61,6 +105,13 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x half> %op1 to <2 x i32> ret <2 x i32> %res } @@ -74,6 +125,12 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i32> ret <4 x i32> %res } @@ -90,6 +147,20 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -114,6 +185,26 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i32> store <16 x i32> %res, ptr %b @@ -130,6 +221,13 @@ define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) { ; CHECK-NEXT: fcvtzu x8, h0 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x half> %op1 to <1 x i64> ret <1 x i64> %res } @@ -145,6 +243,18 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) { ; CHECK-NEXT: 
.cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x half> %op1 to <2 x i64> ret <2 x i64> %res } @@ -167,6 +277,27 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: mov h1, v0.h[2] +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s1 +; NONEON-NOSVE-NEXT: fcvtzu x10, s2 +; NONEON-NOSVE-NEXT: fcvtzu x11, s3 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fptoui <4 x half> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -204,6 +335,47 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: mov h1, v0.h[2] +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: mov h4, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov h5, v2.h[2] +; NONEON-NOSVE-NEXT: mov h6, v2.h[3] +; NONEON-NOSVE-NEXT: mov h7, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvtzu x13, s2 +; NONEON-NOSVE-NEXT: fcvtzu x8, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h7 +; NONEON-NOSVE-NEXT: fcvtzu x10, s3 +; NONEON-NOSVE-NEXT: fcvtzu x11, s4 +; NONEON-NOSVE-NEXT: fcvtzu x12, s5 +; NONEON-NOSVE-NEXT: fcvtzu x14, s6 +; NONEON-NOSVE-NEXT: fmov d3, x13 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: fcvtzu x8, s1 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d2, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v3.d[1], x8 +; NONEON-NOSVE-NEXT: mov v2.d[1], x14 +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i64> store <8 x i64> %res, ptr %b @@ -264,6 +436,80 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q5, q2, [x1, #96] ; CHECK-NEXT: add sp, sp, #128 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h0 +; NONEON-NOSVE-NEXT: mov h0, v0.h[1] +; NONEON-NOSVE-NEXT: mov h1, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s17, h4 +; NONEON-NOSVE-NEXT: mov h18, v4.h[2] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvtzu x8, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: mov h16, v4.h[3] +; NONEON-NOSVE-NEXT: fcvtzu x9, s6 +; NONEON-NOSVE-NEXT: ldr d6, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: mov h4, v4.h[1] +; NONEON-NOSVE-NEXT: fcvtzu x11, s2 +; NONEON-NOSVE-NEXT: mov h2, v6.h[2] +; NONEON-NOSVE-NEXT: fcvtzu x10, s17 +; NONEON-NOSVE-NEXT: fcvtzu x13, s5 +; NONEON-NOSVE-NEXT: fcvtzu x12, s3 +; NONEON-NOSVE-NEXT: mov h3, v6.h[3] +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov h5, v6.h[1] +; NONEON-NOSVE-NEXT: fcvt s17, h18 +; NONEON-NOSVE-NEXT: fcvtzu x14, s7 +; NONEON-NOSVE-NEXT: fmov d7, x8 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fmov d0, x11 +; NONEON-NOSVE-NEXT: fcvtzu x11, s1 +; NONEON-NOSVE-NEXT: fmov d1, x13 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvtzu x13, s16 +; NONEON-NOSVE-NEXT: fmov d16, x9 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvtzu x15, s17 +; NONEON-NOSVE-NEXT: mov v0.d[1], x12 +; NONEON-NOSVE-NEXT: mov v1.d[1], x14 +; NONEON-NOSVE-NEXT: fcvtzu x9, s2 +; NONEON-NOSVE-NEXT: mov v16.d[1], x8 +; NONEON-NOSVE-NEXT: fcvtzu x8, s6 +; NONEON-NOSVE-NEXT: fcvtzu x14, s4 +; NONEON-NOSVE-NEXT: fcvtzu x12, s3 +; NONEON-NOSVE-NEXT: mov v7.d[1], x11 +; 
NONEON-NOSVE-NEXT: fmov d3, x10 +; NONEON-NOSVE-NEXT: fcvtzu x11, s5 +; NONEON-NOSVE-NEXT: fmov d2, x15 +; NONEON-NOSVE-NEXT: stp q16, q1, [x1, #64] +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d4, x8 +; NONEON-NOSVE-NEXT: stp q7, q0, [x1] +; NONEON-NOSVE-NEXT: mov v2.d[1], x13 +; NONEON-NOSVE-NEXT: mov v3.d[1], x14 +; NONEON-NOSVE-NEXT: mov v1.d[1], x12 +; NONEON-NOSVE-NEXT: mov v4.d[1], x11 +; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #96] +; NONEON-NOSVE-NEXT: stp q4, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i64> store <16 x i64> %res, ptr %b @@ -282,6 +528,11 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i16> ret <2 x i16> %res } @@ -295,6 +546,12 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x float> %op1 to <4 x i16> ret <4 x i16> %res } @@ -312,6 +569,14 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x 
float> %op1 to <8 x i16> ret <8 x i16> %res @@ -336,6 +601,19 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v16f32_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptoui <16 x float> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -354,6 +632,11 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i32> ret <2 x i32> %res } @@ -366,6 +649,11 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x float> %op1 to <4 x i32> ret <4 x i32> %res } @@ -379,6 +667,14 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -398,6 +694,13 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f32_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x float> %op1 to <1 x i64> ret <1 x i64> %res } @@ -411,6 +714,12 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i64> ret <2 x i64> %res } @@ -427,6 +736,20 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptoui <4 x float> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -451,6 +774,26 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzu v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzu v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i64> store <8 x i64> %res, ptr %b @@ -468,6 +811,12 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) { ; CHECK-NEXT: mov z0.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i16> ret <1 x i16> %res } @@ -481,6 +830,12 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, 
z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i16> ret <2 x i16> %res } @@ -509,6 +864,15 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i16> ret <4 x i16> %res @@ -552,6 +916,23 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI26_0 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: xtn v7.2s, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI26_0] +; NONEON-NOSVE-NEXT: xtn v6.2s, v1.2d +; NONEON-NOSVE-NEXT: xtn v5.2s, v2.2d +; NONEON-NOSVE-NEXT: xtn v4.2s, v3.2d +; NONEON-NOSVE-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptoui <8 x double> %op1 to <8 x i16> ret <8 x i16> %res @@ -628,6 +1009,35 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fcvtzu_v16f64_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI27_0 +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v5.2d, v5.2d +; NONEON-NOSVE-NEXT: fcvtzs v4.2d, v4.2d +; NONEON-NOSVE-NEXT: fcvtzs v6.2d, v6.2d +; NONEON-NOSVE-NEXT: fcvtzs v7.2d, v7.2d +; NONEON-NOSVE-NEXT: xtn v19.2s, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI27_0] +; NONEON-NOSVE-NEXT: xtn v23.2s, v3.2d +; NONEON-NOSVE-NEXT: xtn v18.2s, v1.2d +; NONEON-NOSVE-NEXT: xtn v22.2s, v2.2d +; NONEON-NOSVE-NEXT: xtn v17.2s, v5.2d +; NONEON-NOSVE-NEXT: xtn v21.2s, v6.2d +; NONEON-NOSVE-NEXT: xtn v16.2s, v4.2d +; NONEON-NOSVE-NEXT: xtn v20.2s, v7.2d +; NONEON-NOSVE-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b +; NONEON-NOSVE-NEXT: tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptoui <16 x double> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -647,6 +1057,13 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i32> ret <1 x i32> %res } @@ -660,6 +1077,12 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i32> ret <2 x i32> %res } @@ -677,6 +1100,14 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i32> ret <4 x i32> %res @@ -701,6 +1132,19 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzu v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtzu v2.2d, v2.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptoui <8 x double> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -719,6 +1163,12 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu x8, d0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i64> ret <1 x i64> %res } @@ -731,6 +1181,11 @@ define <2 x 
i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i64> ret <2 x i64> %res } @@ -744,6 +1199,14 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -762,6 +1225,13 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) { ; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i16> ret <4 x i16> %res } @@ -774,6 +1244,21 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i16> store <8 x i16> %res, ptr %b @@ -789,6 +1274,27 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -808,6 +1314,13 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; 
NONEON-NOSVE-NEXT: ret %res = fptosi <2 x half> %op1 to <2 x i32> ret <2 x i32> %res } @@ -821,6 +1334,12 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i32> ret <4 x i32> %res } @@ -837,6 +1356,20 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -861,6 +1394,26 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i32> store <16 x i32> %res, ptr %b @@ -877,6 +1430,13 @@ define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) { ; CHECK-NEXT: fcvtzs x8, h0 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x half> %op1 to <1 x i64> ret <1 x i64> %res } @@ -893,6 +1453,18 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) { ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x half> %op1 to <2 x i64> ret <2 x i64> %res } @@ -915,6 +1487,27 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: mov h1, v0.h[2] +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s1 +; NONEON-NOSVE-NEXT: fcvtzs x10, s2 +; NONEON-NOSVE-NEXT: fcvtzs x11, s3 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fptosi <4 x half> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -952,6 +1545,47 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: mov h1, v0.h[2] +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: mov h4, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov h5, v2.h[2] +; NONEON-NOSVE-NEXT: mov h6, v2.h[3] +; NONEON-NOSVE-NEXT: mov h7, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvtzs x13, s2 +; NONEON-NOSVE-NEXT: fcvtzs x8, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h7 +; NONEON-NOSVE-NEXT: fcvtzs x10, s3 +; NONEON-NOSVE-NEXT: fcvtzs x11, s4 +; NONEON-NOSVE-NEXT: fcvtzs x12, s5 +; NONEON-NOSVE-NEXT: fcvtzs x14, s6 +; NONEON-NOSVE-NEXT: fmov d3, x13 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: fcvtzs x8, s1 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d2, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v3.d[1], x8 +; NONEON-NOSVE-NEXT: mov v2.d[1], x14 +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i64> store <8 x i64> %res, ptr %b @@ -1012,6 +1646,80 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q5, q2, [x1, #96] ; CHECK-NEXT: add sp, sp, #128 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h0 +; NONEON-NOSVE-NEXT: mov h0, v0.h[1] +; NONEON-NOSVE-NEXT: mov h1, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s17, h4 +; NONEON-NOSVE-NEXT: mov h18, v4.h[2] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvtzs x8, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: mov h16, v4.h[3] +; NONEON-NOSVE-NEXT: fcvtzs x9, s6 +; NONEON-NOSVE-NEXT: ldr d6, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: mov h4, v4.h[1] +; NONEON-NOSVE-NEXT: fcvtzs x11, s2 +; NONEON-NOSVE-NEXT: mov h2, v6.h[2] +; NONEON-NOSVE-NEXT: fcvtzs x10, s17 +; NONEON-NOSVE-NEXT: fcvtzs x13, s5 +; NONEON-NOSVE-NEXT: fcvtzs x12, s3 +; NONEON-NOSVE-NEXT: mov h3, v6.h[3] +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov h5, v6.h[1] +; NONEON-NOSVE-NEXT: fcvt s17, h18 +; NONEON-NOSVE-NEXT: fcvtzs x14, s7 +; NONEON-NOSVE-NEXT: fmov d7, x8 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fmov d0, x11 +; NONEON-NOSVE-NEXT: fcvtzs x11, s1 +; NONEON-NOSVE-NEXT: fmov d1, x13 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvtzs x13, s16 +; NONEON-NOSVE-NEXT: fmov d16, x9 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvtzs x15, s17 +; NONEON-NOSVE-NEXT: mov v0.d[1], x12 +; NONEON-NOSVE-NEXT: mov v1.d[1], x14 +; NONEON-NOSVE-NEXT: fcvtzs x9, s2 +; NONEON-NOSVE-NEXT: mov v16.d[1], x8 +; NONEON-NOSVE-NEXT: fcvtzs x8, s6 +; NONEON-NOSVE-NEXT: fcvtzs x14, s4 +; NONEON-NOSVE-NEXT: fcvtzs x12, s3 +; NONEON-NOSVE-NEXT: mov v7.d[1], x11 +; 
NONEON-NOSVE-NEXT: fmov d3, x10 +; NONEON-NOSVE-NEXT: fcvtzs x11, s5 +; NONEON-NOSVE-NEXT: fmov d2, x15 +; NONEON-NOSVE-NEXT: stp q16, q1, [x1, #64] +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d4, x8 +; NONEON-NOSVE-NEXT: stp q7, q0, [x1] +; NONEON-NOSVE-NEXT: mov v2.d[1], x13 +; NONEON-NOSVE-NEXT: mov v3.d[1], x14 +; NONEON-NOSVE-NEXT: mov v1.d[1], x12 +; NONEON-NOSVE-NEXT: mov v4.d[1], x11 +; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #96] +; NONEON-NOSVE-NEXT: stp q4, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i64> store <16 x i64> %res, ptr %b @@ -1030,6 +1738,11 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i16> ret <2 x i16> %res } @@ -1043,6 +1756,12 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x float> %op1 to <4 x i16> ret <4 x i16> %res } @@ -1060,6 +1779,14 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 
x float> %op1 to <8 x i16> ret <8 x i16> %res @@ -1084,6 +1811,19 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v16f32_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptosi <16 x float> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -1102,6 +1842,11 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i32> ret <2 x i32> %res } @@ -1114,6 +1859,11 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x float> %op1 to <4 x i32> ret <4 x i32> %res } @@ -1127,6 +1877,14 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -1146,6 +1904,13 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f32_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x float> %op1 to <1 x i64> ret <1 x i64> %res } @@ -1159,6 +1924,12 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i64> ret <2 x i64> %res } @@ -1175,6 +1946,20 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptosi <4 x float> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -1199,6 +1984,26 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i64> store <8 x i64> %res, ptr %b @@ -1218,6 +2023,12 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) { ; CHECK-NEXT: mov z0.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i16> ret <1 x i16> %res } @@ -1231,6 +2042,12 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) { ; CHECK-NEXT: uzp1 
z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i16> ret <2 x i16> %res } @@ -1259,6 +2076,15 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i16> ret <4 x i16> %res @@ -1302,6 +2128,23 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI61_0 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: xtn v7.2s, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI61_0] +; NONEON-NOSVE-NEXT: xtn v6.2s, v1.2d +; NONEON-NOSVE-NEXT: xtn v5.2s, v2.2d +; NONEON-NOSVE-NEXT: xtn v4.2s, v3.2d +; NONEON-NOSVE-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptosi <8 x double> %op1 to <8 x i16> ret <8 x i16> %res @@ -1378,6 +2221,35 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fcvtzs_v16f64_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI62_0 +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v5.2d, v5.2d +; NONEON-NOSVE-NEXT: fcvtzs v4.2d, v4.2d +; NONEON-NOSVE-NEXT: fcvtzs v6.2d, v6.2d +; NONEON-NOSVE-NEXT: fcvtzs v7.2d, v7.2d +; NONEON-NOSVE-NEXT: xtn v19.2s, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI62_0] +; NONEON-NOSVE-NEXT: xtn v23.2s, v3.2d +; NONEON-NOSVE-NEXT: xtn v18.2s, v1.2d +; NONEON-NOSVE-NEXT: xtn v22.2s, v2.2d +; NONEON-NOSVE-NEXT: xtn v17.2s, v5.2d +; NONEON-NOSVE-NEXT: xtn v21.2s, v6.2d +; NONEON-NOSVE-NEXT: xtn v16.2s, v4.2d +; NONEON-NOSVE-NEXT: xtn v20.2s, v7.2d +; NONEON-NOSVE-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b +; NONEON-NOSVE-NEXT: tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptosi <16 x double> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -1397,6 +2269,13 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i32> ret <1 x i32> %res } @@ -1410,6 +2289,12 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i32> ret <2 x i32> %res } @@ -1427,6 +2312,14 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i32> ret <4 x i32> %res @@ -1451,6 +2344,19 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptosi <8 x double> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -1469,6 +2375,12 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i64> ret <1 x i64> %res } @@ -1481,6 +2393,11 @@ define 
<2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i64> ret <2 x i64> %res } @@ -1494,6 +2411,14 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i64> store <4 x i64> %res, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index c1c7b5c05f5d55..d3b09374676556 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -27,6 +28,14 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uzp1 v2.4h, v2.4h, v0.4h +; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 +; 
NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2 ret <2 x half> %sel } @@ -45,6 +54,13 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2 ret <4 x half> %sel } @@ -64,6 +80,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: shl v2.8h, v2.8h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.8h, v2.8h, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2 ret <8 x half> %sel } @@ -80,6 +104,126 @@ define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: mov h17, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; 
NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w14, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w12, eq +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w11, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: mov h7, v0.h[7] +; NONEON-NOSVE-NEXT: mov h18, v3.h[3] +; NONEON-NOSVE-NEXT: csetm w13, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v3.h[1] +; NONEON-NOSVE-NEXT: mov h5, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s17, s16 +; NONEON-NOSVE-NEXT: mov h16, v3.h[2] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h17, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: fcvt s6, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h2 +; NONEON-NOSVE-NEXT: csetm w15, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: fmov s4, w14 +; NONEON-NOSVE-NEXT: csetm w16, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[3] +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s16, h17 +; NONEON-NOSVE-NEXT: mov v4.h[1], w8 +; NONEON-NOSVE-NEXT: fcvt s17, h18 +; NONEON-NOSVE-NEXT: csetm 
w14, eq +; NONEON-NOSVE-NEXT: fmov s5, w14 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s16, s7 +; NONEON-NOSVE-NEXT: mov h7, v3.h[4] +; NONEON-NOSVE-NEXT: mov h16, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[2], w12 +; NONEON-NOSVE-NEXT: mov v5.h[1], w16 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s17 +; NONEON-NOSVE-NEXT: mov h17, v2.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: mov h16, v3.h[5] +; NONEON-NOSVE-NEXT: mov v4.h[3], w11 +; NONEON-NOSVE-NEXT: mov v5.h[2], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v3.h[6] +; NONEON-NOSVE-NEXT: mov h7, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v4.h[4], w13 +; NONEON-NOSVE-NEXT: mov v5.h[3], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcmp s17, s16 +; NONEON-NOSVE-NEXT: mov h16, v3.h[7] +; NONEON-NOSVE-NEXT: mov h17, v2.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[4], w8 +; NONEON-NOSVE-NEXT: mov v4.h[5], w9 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: fcvt s6, h16 +; NONEON-NOSVE-NEXT: fcvt s7, h17 +; NONEON-NOSVE-NEXT: mov v5.h[5], w8 +; NONEON-NOSVE-NEXT: mov v4.h[6], w10 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov v5.h[6], w8 +; NONEON-NOSVE-NEXT: mov v4.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v5.h[7], w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %mask = fcmp oeq <16 x half> %op1, %op2 @@ -102,6 +246,13 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m ; 
CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2 ret <2 x float> %sel } @@ -121,6 +272,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: shl v2.4s, v2.4s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.4s, v2.4s, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2 ret <4 x float> %sel } @@ -137,6 +296,18 @@ define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: fcmeq v4.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcmeq v5.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %mask = fcmp oeq <8 x float> %op1, %op2 @@ -151,6 +322,14 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: fcsel d0, d0, d1, ne ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; 
NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: fmov d2, x8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel } @@ -170,6 +349,14 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: shl v2.2d, v2.2d, #63 +; NONEON-NOSVE-NEXT: cmlt v2.2d, v2.2d, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2 ret <2 x double> %sel } @@ -186,6 +373,18 @@ define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: fcmeq v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: fcmeq v5.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %mask = fcmp oeq <4 x double> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index ff38db8c10c04b..ae97a266c6ff0d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: 
llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -21,6 +22,14 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i8> %op1, i8 5, i64 3 ret <4 x i8> %r } @@ -38,6 +47,14 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { ; CHECK-NEXT: mov z0.b, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.b[7], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x i8> %op1, i8 5, i64 7 ret <8 x i8> %r } @@ -55,6 +72,12 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { ; CHECK-NEXT: mov z0.b, p0/m, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.b[15], w8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <16 x i8> %op1, i8 5, i64 15 ret <16 x i8> %r } @@ -72,6 +95,12 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { ; CHECK-NEXT: mov z1.b, p0/m, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; 
NONEON-NOSVE-NEXT: mov v1.b[15], w8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <32 x i8> %op1, i8 5, i64 31 ret <32 x i8> %r } @@ -90,6 +119,14 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i16> %op1, i16 5, i64 1 ret <2 x i16> %r } @@ -107,6 +144,14 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i16> %op1, i16 5, i64 3 ret <4 x i16> %r } @@ -124,6 +169,12 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.h[7], w8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x i16> %op1, i16 5, i64 7 ret <8 x i16> %r } @@ -141,6 +192,12 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { ; CHECK-NEXT: mov z1.h, p0/m, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v1.h[7], w8 +; 
NONEON-NOSVE-NEXT: ret %r = insertelement <16 x i16> %op1, i16 5, i64 15 ret <16 x i16> %r } @@ -159,6 +216,14 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i32> %op1, i32 5, i64 1 ret <2 x i32> %r } @@ -176,6 +241,12 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i32> %op1, i32 5, i64 3 ret <4 x i32> %r } @@ -193,6 +264,13 @@ define <8 x i32> @insertelement_v8i32(ptr %a) { ; CHECK-NEXT: mov z1.s, p0/m, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %r = insertelement <8 x i32> %op1, i32 5, i64 7 ret <8 x i32> %r @@ -205,6 +283,12 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) { ; CHECK-NEXT: mov z0.d, #5 // =0x5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <1 x i64> %op1, i64 5, i64 0 ret <1 x i64> %r } @@ -222,6 +306,12 @@ 
define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i64> %op1, i64 5, i64 1 ret <2 x i64> %r } @@ -239,6 +329,13 @@ define <4 x i64> @insertelement_v4i64(ptr %a) { ; CHECK-NEXT: mov z1.d, p0/m, x8 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v1.d[1], x8 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %r = insertelement <4 x i64> %op1, i64 5, i64 3 ret <4 x i64> %r @@ -257,6 +354,16 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI14_0 +; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI14_0 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: ld1r { v1.4h }, [x8] +; NONEON-NOSVE-NEXT: mov v1.h[0], v0.h[0] +; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x half> %op1, half 5.0, i64 1 ret <2 x half> %r } @@ -274,6 +381,15 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI15_0 +; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI15_0 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed 
$q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x half> %op1, half 5.0, i64 3 ret <4 x half> %r } @@ -291,6 +407,13 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI16_0 +; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI16_0 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x8] +; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x half> %op1, half 5.0, i64 7 ret <8 x half> %r } @@ -308,6 +431,14 @@ define <16 x half> @insertelement_v16f16(ptr %a) { ; CHECK-NEXT: mov z1.h, p0/m, h2 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI17_0 +; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI17_0 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[7], [x8] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %r = insertelement <16 x half> %op1, half 5.0, i64 15 ret <16 x half> %r @@ -327,6 +458,14 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, s1 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov s1, #5.00000000 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov v0.s[1], v1.s[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x float> %op1, float 5.0, i64 1 ret <2 x float> %r } @@ -344,6 +483,12 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, s1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov 
s1, #5.00000000 +; NONEON-NOSVE-NEXT: mov v0.s[3], v1.s[0] +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x float> %op1, float 5.0, i64 3 ret <4 x float> %r } @@ -361,6 +506,13 @@ define <8 x float> @insertelement_v8f32(ptr %a) { ; CHECK-NEXT: mov z1.s, p0/m, s2 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov s2, #5.00000000 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov v1.s[3], v2.s[0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %r = insertelement <8 x float> %op1, float 5.0, i64 7 ret <8 x float> %r @@ -372,6 +524,12 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, #5.00000000 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <1 x double> %op1, double 5.0, i64 0 ret <1 x double> %r } @@ -389,6 +547,12 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) { ; CHECK-NEXT: mov z0.d, p0/m, d1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov d1, #5.00000000 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x double> %op1, double 5.0, i64 1 ret <2 x double> %r } @@ -406,6 +570,14 @@ define <4 x double> @insertelement_v4f64(ptr %a) { ; CHECK-NEXT: mov z1.d, p0/m, d2 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov d0, #5.00000000 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: mov v1.d[1], v0.d[0] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 
= load <4 x double>, ptr %a %r = insertelement <4 x double> %op1, double 5.0, i64 3 ret <4 x double> %r diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll index ee1706bc7c3549..1b438559e05380 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,11 @@ define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = add <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -28,6 +34,11 @@ define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = add <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -40,6 +51,11 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = add <16 x i8> %op1, %op2 
ret <16 x i8> %res } @@ -53,6 +69,15 @@ define void @add_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.b, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = add <32 x i8> %op1, %op2 @@ -68,6 +93,11 @@ define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = add <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -80,6 +110,11 @@ define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = add <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -92,6 +127,11 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = add <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -105,6 +145,15 @@ define void @add_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.h, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; 
NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = add <16 x i16> %op1, %op2 @@ -120,6 +169,11 @@ define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = add <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -132,6 +186,11 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = add <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -145,6 +204,15 @@ define void @add_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.s, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = add <8 x i32> %op1, %op2 @@ -160,6 +228,11 @@ define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = add <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -172,6 +245,11 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; 
CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = add <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -185,6 +263,15 @@ define void @add_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: add v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = add <4 x i64> %op1, %op2 @@ -213,6 +300,11 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE2-NEXT: mul z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = mul <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -234,6 +326,11 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; SVE2-NEXT: mul z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = mul <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -255,6 +352,11 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; SVE2-NEXT: mul z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = mul <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -279,6 +381,15 @@ define void @mul_v32i8(ptr %a, ptr 
%b) { ; SVE2-NEXT: mul z1.b, z2.b, z3.b ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: mul v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: mul v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = mul <32 x i8> %op1, %op2 @@ -303,6 +414,11 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE2-NEXT: mul z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = mul <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -324,6 +440,11 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; SVE2-NEXT: mul z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = mul <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -345,6 +466,11 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; SVE2-NEXT: mul z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = mul <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -369,6 +495,15 @@ define void @mul_v16i16(ptr %a, ptr %b) { ; SVE2-NEXT: mul z1.h, z2.h, z3.h ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: mul v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: mul v1.8h, v2.8h, v3.8h +; 
NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = mul <16 x i16> %op1, %op2 @@ -393,6 +528,11 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; SVE2-NEXT: mul z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = mul <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -414,6 +554,11 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; SVE2-NEXT: mul z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = mul <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -438,6 +583,15 @@ define void @mul_v8i32(ptr %a, ptr %b) { ; SVE2-NEXT: mul z1.s, z2.s, z3.s ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: mul v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: mul v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = mul <8 x i32> %op1, %op2 @@ -462,6 +616,16 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; SVE2-NEXT: mul z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = mul <1 x i64> %op1, 
%op2 ret <1 x i64> %res } @@ -483,6 +647,18 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; SVE2-NEXT: mul z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x10, d1 +; NONEON-NOSVE-NEXT: fmov x11, d0 +; NONEON-NOSVE-NEXT: mov x8, v1.d[1] +; NONEON-NOSVE-NEXT: mov x9, v0.d[1] +; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: ret %res = mul <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -507,6 +683,29 @@ define void @mul_v4i64(ptr %a, ptr %b) { ; SVE2-NEXT: mul z1.d, z2.d, z3.d ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: fmov x12, d2 +; NONEON-NOSVE-NEXT: mov x11, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x10, v3.d[1] +; NONEON-NOSVE-NEXT: mov x13, v1.d[1] +; NONEON-NOSVE-NEXT: mov x14, v0.d[1] +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov x9, d3 +; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: mul x9, x12, x9 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: mul x11, x14, x13 +; NONEON-NOSVE-NEXT: fmov d0, x9 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = mul <4 x i64> %op1, %op2 @@ -526,6 +725,11 @@ define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; 
NONEON-NOSVE-NEXT: ret %res = sub <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -538,6 +742,11 @@ define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = sub <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -550,6 +759,11 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = sub <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -563,6 +777,15 @@ define void @sub_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sub z1.b, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: sub v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = sub <32 x i8> %op1, %op2 @@ -578,6 +801,11 @@ define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = sub <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -590,6 +818,11 @@ define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i16: +; NONEON-NOSVE: // %bb.0: 
+; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = sub <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -602,6 +835,11 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = sub <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -615,6 +853,15 @@ define void @sub_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: sub z1.h, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = sub <16 x i16> %op1, %op2 @@ -630,6 +877,11 @@ define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = sub <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -642,6 +894,11 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = sub <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -655,6 +912,15 @@ define void @sub_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: sub z1.s, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v8i32: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = sub <8 x i32> %op1, %op2 @@ -670,6 +936,11 @@ define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = sub <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -682,6 +953,11 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = sub <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -695,6 +971,15 @@ define void @sub_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: sub z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: sub v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = sub <4 x i64> %op1, %op2 @@ -715,6 +1000,13 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) { ; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: abs v0.4h, v0.4h +; 
NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false) ret <4 x i8> %res } @@ -727,6 +1019,11 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) { ; CHECK-NEXT: abs z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false) ret <8 x i8> %res } @@ -739,6 +1036,11 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) { ; CHECK-NEXT: abs z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false) ret <16 x i8> %res } @@ -752,6 +1054,14 @@ define void @abs_v32i8(ptr %a) { ; CHECK-NEXT: abs z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.16b, v0.16b +; NONEON-NOSVE-NEXT: abs v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false) store <32 x i8> %res, ptr %a @@ -767,6 +1077,13 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false) ret <2 x i16> %res } @@ -779,6 +1096,11 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) { ; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.4h, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false) ret <4 x i16> %res } @@ -791,6 +1113,11 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) { ; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.8h, v0.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false) ret <8 x i16> %res } @@ -804,6 +1131,14 @@ define void @abs_v16i16(ptr %a) { ; CHECK-NEXT: abs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.8h, v0.8h +; NONEON-NOSVE-NEXT: abs v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false) store <16 x i16> %res, ptr %a @@ -818,6 +1153,11 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) ret <2 x i32> %res } @@ -830,6 +1170,11 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) ret <4 x i32> %res } @@ -843,6 +1188,14 @@ define void @abs_v8i32(ptr %a) { ; 
CHECK-NEXT: abs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: abs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) store <8 x i32> %res, ptr %a @@ -857,6 +1210,11 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) { ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false) ret <1 x i64> %res } @@ -869,6 +1227,11 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) { ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) ret <2 x i64> %res } @@ -882,6 +1245,14 @@ define void @abs_v4i64(ptr %a) { ; CHECK-NEXT: abs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: abs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) store <4 x i64> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll index c2f3bbfb51dd52..ee0ca0e60b5e51 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,11 @@ define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <8 x i8> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i8> ret <8 x i8> %sext @@ -33,6 +39,11 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <16 x i8> %op1, %op2 %sext = sext <16 x i1> %cmp to <16 x i8> ret <16 x i8> %sext @@ -50,6 +61,15 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cmeq v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %cmp = icmp eq <32 x i8> %op1, %op2 @@ 
-68,6 +88,11 @@ define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <4 x i16> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i16> ret <4 x i16> %sext @@ -83,6 +108,11 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <8 x i16> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i16> ret <8 x i16> %sext @@ -100,6 +130,15 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: cmeq v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %cmp = icmp eq <16 x i16> %op1, %op2 @@ -118,6 +157,11 @@ define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <2 x i32> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i32> ret <2 x i32> %sext @@ -133,6 +177,11 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 
x i32> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <4 x i32> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %sext @@ -150,6 +199,15 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: cmeq v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %cmp = icmp eq <8 x i32> %op1, %op2 @@ -168,6 +226,11 @@ define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <1 x i64> %op1, %op2 %sext = sext <1 x i1> %cmp to <1 x i64> ret <1 x i64> %sext @@ -183,6 +246,11 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <2 x i64> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i64> ret <2 x i64> %sext @@ -200,6 +268,15 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, 
[x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmeq v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %cmp = icmp eq <4 x i64> %op1, %op2 @@ -224,6 +301,17 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ne_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cmeq v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: mvn v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %cmp = icmp ne <32 x i8> %op1, %op2 @@ -246,6 +334,14 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sge_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmge v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b %cmp = icmp sge <8 x i16> %op1, %op2 @@ -270,6 +366,15 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sgt_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmgt v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: cmgt 
v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %cmp = icmp sgt <16 x i16> %op1, %op2 @@ -292,6 +397,14 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sle_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmge v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %op2 = load <4 x i32>, ptr %b %cmp = icmp sle <4 x i32> %op1, %op2 @@ -316,6 +429,15 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_slt_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmgt v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: cmgt v1.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %cmp = icmp slt <8 x i32> %op1, %op2 @@ -338,6 +460,14 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_uge_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmhs v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %cmp = icmp uge <2 x i64> %op1, %op2 @@ -360,6 +490,14 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ugt_v2i64: 
+; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmhi v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %cmp = icmp ugt <2 x i64> %op1, %op2 @@ -382,6 +520,14 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ule_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmhs v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %cmp = icmp ule <2 x i64> %op1, %op2 @@ -404,6 +550,14 @@ define void @icmp_ult_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ult_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmhi v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %cmp = icmp ult <2 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index e6fd775b4cfb9b..d79d6c18ed5a6e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc 
-force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -24,6 +25,31 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w10, v0.h[0] +; NONEON-NOSVE-NEXT: smov w11, v0.h[2] +; NONEON-NOSVE-NEXT: smov w12, v0.h[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.h[2], w10 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -51,6 +77,45 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w10, v0.b[0] +; NONEON-NOSVE-NEXT: smov w11, v0.b[2] +; NONEON-NOSVE-NEXT: smov w12, v0.b[3] +; NONEON-NOSVE-NEXT: smov w13, v0.b[4] +; NONEON-NOSVE-NEXT: smov w14, v0.b[5] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; 
NONEON-NOSVE-NEXT: smov w9, v1.b[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: smov w9, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v2.b[2], w10 +; NONEON-NOSVE-NEXT: smov w10, v0.b[6] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.b[5] +; NONEON-NOSVE-NEXT: mov v2.b[3], w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[7] +; NONEON-NOSVE-NEXT: sdiv w8, w14, w13 +; NONEON-NOSVE-NEXT: mov v2.b[4], w12 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.b[6], w9 +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -98,6 +163,75 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w10, v0.b[0] +; NONEON-NOSVE-NEXT: smov w11, v0.b[2] +; NONEON-NOSVE-NEXT: smov w12, v0.b[3] +; NONEON-NOSVE-NEXT: smov w13, v0.b[4] +; NONEON-NOSVE-NEXT: smov w14, v0.b[5] +; NONEON-NOSVE-NEXT: smov w15, v0.b[6] +; NONEON-NOSVE-NEXT: smov w16, v0.b[7] +; NONEON-NOSVE-NEXT: smov w17, v0.b[8] +; NONEON-NOSVE-NEXT: smov w18, v0.b[9] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.b[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.b[3] +; 
NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: smov w9, v1.b[10] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v2.b[2], w10 +; NONEON-NOSVE-NEXT: smov w10, v0.b[10] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.b[5] +; NONEON-NOSVE-NEXT: mov v2.b[3], w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[11] +; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 +; NONEON-NOSVE-NEXT: smov w14, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[4], w12 +; NONEON-NOSVE-NEXT: smov w12, v0.b[12] +; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 +; NONEON-NOSVE-NEXT: smov w15, v1.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[5], w13 +; NONEON-NOSVE-NEXT: smov w13, v0.b[13] +; NONEON-NOSVE-NEXT: sdiv w15, w16, w15 +; NONEON-NOSVE-NEXT: smov w16, v1.b[8] +; NONEON-NOSVE-NEXT: mov v2.b[6], w14 +; NONEON-NOSVE-NEXT: sdiv w16, w17, w16 +; NONEON-NOSVE-NEXT: smov w17, v1.b[9] +; NONEON-NOSVE-NEXT: mov v2.b[7], w15 +; NONEON-NOSVE-NEXT: sdiv w8, w18, w17 +; NONEON-NOSVE-NEXT: mov v2.b[8], w16 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[11] +; NONEON-NOSVE-NEXT: mov v2.b[9], w8 +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.b[12] +; NONEON-NOSVE-NEXT: mov v2.b[10], w9 +; NONEON-NOSVE-NEXT: smov w9, v1.b[14] +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[11], w10 +; NONEON-NOSVE-NEXT: smov w10, v1.b[15] +; NONEON-NOSVE-NEXT: sdiv w8, w13, w12 +; NONEON-NOSVE-NEXT: smov w12, v0.b[14] +; NONEON-NOSVE-NEXT: mov v2.b[12], w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[15] +; NONEON-NOSVE-NEXT: sdiv w9, w12, w9 +; NONEON-NOSVE-NEXT: mov v2.b[13], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.b[14], w9 +; NONEON-NOSVE-NEXT: mov v2.b[15], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = sdiv <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ 
-178,6 +312,163 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str x27, [sp, #-80]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w10, v0.b[0] +; NONEON-NOSVE-NEXT: smov w11, v0.b[2] +; NONEON-NOSVE-NEXT: smov w12, v0.b[3] +; NONEON-NOSVE-NEXT: smov w13, v0.b[4] +; NONEON-NOSVE-NEXT: smov w14, v0.b[5] +; NONEON-NOSVE-NEXT: smov w15, v0.b[6] +; NONEON-NOSVE-NEXT: smov w17, v0.b[8] +; NONEON-NOSVE-NEXT: smov w2, v0.b[10] +; NONEON-NOSVE-NEXT: smov w3, v0.b[11] +; NONEON-NOSVE-NEXT: smov w4, v0.b[12] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.b[0] +; NONEON-NOSVE-NEXT: smov w5, v0.b[13] +; NONEON-NOSVE-NEXT: smov w6, v0.b[14] +; NONEON-NOSVE-NEXT: smov w1, v3.b[1] +; NONEON-NOSVE-NEXT: smov w7, v2.b[0] +; NONEON-NOSVE-NEXT: smov w19, v2.b[2] +; NONEON-NOSVE-NEXT: smov w20, v2.b[3] +; NONEON-NOSVE-NEXT: smov w21, v2.b[4] +; NONEON-NOSVE-NEXT: 
smov w22, v2.b[5] +; NONEON-NOSVE-NEXT: smov w23, v2.b[6] +; NONEON-NOSVE-NEXT: smov w24, v2.b[7] +; NONEON-NOSVE-NEXT: smov w25, v2.b[8] +; NONEON-NOSVE-NEXT: smov w26, v2.b[9] +; NONEON-NOSVE-NEXT: smov w27, v2.b[10] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[2] +; NONEON-NOSVE-NEXT: sdiv w11, w11, w10 +; NONEON-NOSVE-NEXT: smov w10, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s5, w9 +; NONEON-NOSVE-NEXT: smov w9, v3.b[11] +; NONEON-NOSVE-NEXT: mov v5.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w10, w12, w10 +; NONEON-NOSVE-NEXT: smov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v5.b[2], w11 +; NONEON-NOSVE-NEXT: smov w11, v2.b[11] +; NONEON-NOSVE-NEXT: sdiv w13, w13, w12 +; NONEON-NOSVE-NEXT: smov w12, v1.b[5] +; NONEON-NOSVE-NEXT: mov v5.b[3], w10 +; NONEON-NOSVE-NEXT: smov w10, v3.b[12] +; NONEON-NOSVE-NEXT: sdiv w12, w14, w12 +; NONEON-NOSVE-NEXT: smov w14, v1.b[6] +; NONEON-NOSVE-NEXT: mov v5.b[4], w13 +; NONEON-NOSVE-NEXT: smov w13, v2.b[14] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: smov w14, v1.b[7] +; NONEON-NOSVE-NEXT: smov w15, v0.b[7] +; NONEON-NOSVE-NEXT: mov v5.b[5], w12 +; NONEON-NOSVE-NEXT: smov w12, v2.b[13] +; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 +; NONEON-NOSVE-NEXT: smov w15, v1.b[8] +; NONEON-NOSVE-NEXT: mov v5.b[6], w16 +; NONEON-NOSVE-NEXT: sdiv w18, w17, w15 +; NONEON-NOSVE-NEXT: smov w15, v1.b[9] +; NONEON-NOSVE-NEXT: smov w17, v0.b[9] +; NONEON-NOSVE-NEXT: mov v5.b[7], w14 +; NONEON-NOSVE-NEXT: sdiv w17, w17, w15 +; NONEON-NOSVE-NEXT: smov w15, v1.b[10] +; NONEON-NOSVE-NEXT: mov v5.b[8], w18 +; NONEON-NOSVE-NEXT: sdiv w15, w2, w15 +; NONEON-NOSVE-NEXT: smov w2, v1.b[11] +; NONEON-NOSVE-NEXT: mov v5.b[9], w17 +; NONEON-NOSVE-NEXT: sdiv w2, w3, w2 +; NONEON-NOSVE-NEXT: smov w3, v1.b[12] +; NONEON-NOSVE-NEXT: mov v5.b[10], w15 +; NONEON-NOSVE-NEXT: sdiv w3, w4, w3 +; NONEON-NOSVE-NEXT: smov w4, v1.b[13] +; NONEON-NOSVE-NEXT: mov v5.b[11], w2 +; NONEON-NOSVE-NEXT: sdiv w4, w5, w4 +; NONEON-NOSVE-NEXT: smov w5, 
v1.b[14] +; NONEON-NOSVE-NEXT: mov v5.b[12], w3 +; NONEON-NOSVE-NEXT: sdiv w5, w6, w5 +; NONEON-NOSVE-NEXT: smov w6, v2.b[1] +; NONEON-NOSVE-NEXT: mov v5.b[13], w4 +; NONEON-NOSVE-NEXT: sdiv w1, w6, w1 +; NONEON-NOSVE-NEXT: smov w6, v3.b[0] +; NONEON-NOSVE-NEXT: mov v5.b[14], w5 +; NONEON-NOSVE-NEXT: sdiv w6, w7, w6 +; NONEON-NOSVE-NEXT: smov w7, v3.b[2] +; NONEON-NOSVE-NEXT: sdiv w7, w19, w7 +; NONEON-NOSVE-NEXT: smov w19, v3.b[3] +; NONEON-NOSVE-NEXT: fmov s4, w6 +; NONEON-NOSVE-NEXT: mov v4.b[1], w1 +; NONEON-NOSVE-NEXT: sdiv w19, w20, w19 +; NONEON-NOSVE-NEXT: smov w20, v3.b[4] +; NONEON-NOSVE-NEXT: mov v4.b[2], w7 +; NONEON-NOSVE-NEXT: sdiv w20, w21, w20 +; NONEON-NOSVE-NEXT: smov w21, v3.b[5] +; NONEON-NOSVE-NEXT: mov v4.b[3], w19 +; NONEON-NOSVE-NEXT: sdiv w21, w22, w21 +; NONEON-NOSVE-NEXT: smov w22, v3.b[6] +; NONEON-NOSVE-NEXT: mov v4.b[4], w20 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w22, w23, w22 +; NONEON-NOSVE-NEXT: smov w23, v3.b[7] +; NONEON-NOSVE-NEXT: mov v4.b[5], w21 +; NONEON-NOSVE-NEXT: sdiv w23, w24, w23 +; NONEON-NOSVE-NEXT: smov w24, v3.b[8] +; NONEON-NOSVE-NEXT: mov v4.b[6], w22 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w24, w25, w24 +; NONEON-NOSVE-NEXT: smov w25, v3.b[9] +; NONEON-NOSVE-NEXT: mov v4.b[7], w23 +; NONEON-NOSVE-NEXT: sdiv w25, w26, w25 +; NONEON-NOSVE-NEXT: smov w26, v3.b[10] +; NONEON-NOSVE-NEXT: mov v4.b[8], w24 +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w8, w27, w26 +; NONEON-NOSVE-NEXT: mov v4.b[9], w25 +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w9, w11, w9 +; NONEON-NOSVE-NEXT: smov w11, v2.b[12] +; NONEON-NOSVE-NEXT: mov v4.b[10], w8 +; NONEON-NOSVE-NEXT: smov w8, v3.b[15] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v3.b[13] +; NONEON-NOSVE-NEXT: mov v4.b[11], w9 +; 
NONEON-NOSVE-NEXT: smov w9, v1.b[15] +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v3.b[14] +; NONEON-NOSVE-NEXT: mov v4.b[12], w10 +; NONEON-NOSVE-NEXT: smov w10, v0.b[15] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v2.b[15] +; NONEON-NOSVE-NEXT: mov v4.b[13], w11 +; NONEON-NOSVE-NEXT: sdiv w8, w13, w8 +; NONEON-NOSVE-NEXT: mov v4.b[14], w12 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov v4.b[15], w8 +; NONEON-NOSVE-NEXT: mov v5.b[15], w9 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = sdiv <32 x i8> %op1, %op2 @@ -196,6 +487,23 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -212,6 +520,29 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; 
NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w10, v0.h[0] +; NONEON-NOSVE-NEXT: smov w11, v0.h[2] +; NONEON-NOSVE-NEXT: smov w12, v0.h[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.h[2], w10 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -238,6 +569,43 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w10, v0.h[0] +; NONEON-NOSVE-NEXT: smov w11, v0.h[2] +; NONEON-NOSVE-NEXT: smov w12, v0.h[3] +; NONEON-NOSVE-NEXT: smov w13, v0.h[4] +; NONEON-NOSVE-NEXT: smov w14, v0.h[5] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: smov w9, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[2], w10 +; NONEON-NOSVE-NEXT: smov w10, v0.h[6] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.h[5] +; NONEON-NOSVE-NEXT: mov v2.h[3], w11 +; NONEON-NOSVE-NEXT: smov w11, v0.h[7] +; NONEON-NOSVE-NEXT: sdiv 
w8, w14, w13 +; NONEON-NOSVE-NEXT: mov v2.h[4], w12 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.h[6], w9 +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -278,6 +646,79 @@ define void @sdiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w10, v0.h[0] +; NONEON-NOSVE-NEXT: smov w11, v0.h[2] +; NONEON-NOSVE-NEXT: smov w12, v0.h[3] +; NONEON-NOSVE-NEXT: smov w13, v0.h[4] +; NONEON-NOSVE-NEXT: smov w14, v0.h[5] +; NONEON-NOSVE-NEXT: smov w15, v0.h[6] +; NONEON-NOSVE-NEXT: smov w16, v2.h[1] +; NONEON-NOSVE-NEXT: smov w17, v2.h[0] +; NONEON-NOSVE-NEXT: smov w18, v2.h[2] +; NONEON-NOSVE-NEXT: smov w1, v2.h[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: smov w2, v2.h[4] +; NONEON-NOSVE-NEXT: smov w3, v2.h[5] +; NONEON-NOSVE-NEXT: smov w4, v2.h[6] +; NONEON-NOSVE-NEXT: sdiv w10, w10, w9 +; NONEON-NOSVE-NEXT: smov w9, v1.h[2] +; NONEON-NOSVE-NEXT: sdiv w9, w11, w9 +; NONEON-NOSVE-NEXT: smov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s5, w10 +; NONEON-NOSVE-NEXT: smov w10, v3.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[2], w9 +; NONEON-NOSVE-NEXT: smov w9, v2.h[7] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[3], w11 +; 
NONEON-NOSVE-NEXT: smov w11, v0.h[7] +; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 +; NONEON-NOSVE-NEXT: smov w14, v1.h[6] +; NONEON-NOSVE-NEXT: mov v5.h[4], w12 +; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 +; NONEON-NOSVE-NEXT: smov w15, v3.h[1] +; NONEON-NOSVE-NEXT: mov v5.h[5], w13 +; NONEON-NOSVE-NEXT: sdiv w15, w16, w15 +; NONEON-NOSVE-NEXT: smov w16, v3.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[6], w14 +; NONEON-NOSVE-NEXT: sdiv w16, w17, w16 +; NONEON-NOSVE-NEXT: smov w17, v3.h[2] +; NONEON-NOSVE-NEXT: sdiv w17, w18, w17 +; NONEON-NOSVE-NEXT: smov w18, v3.h[3] +; NONEON-NOSVE-NEXT: fmov s4, w16 +; NONEON-NOSVE-NEXT: mov v4.h[1], w15 +; NONEON-NOSVE-NEXT: sdiv w18, w1, w18 +; NONEON-NOSVE-NEXT: smov w1, v3.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[2], w17 +; NONEON-NOSVE-NEXT: sdiv w1, w2, w1 +; NONEON-NOSVE-NEXT: smov w2, v3.h[5] +; NONEON-NOSVE-NEXT: mov v4.h[3], w18 +; NONEON-NOSVE-NEXT: sdiv w2, w3, w2 +; NONEON-NOSVE-NEXT: smov w3, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[4], w1 +; NONEON-NOSVE-NEXT: sdiv w8, w4, w3 +; NONEON-NOSVE-NEXT: mov v4.h[5], w2 +; NONEON-NOSVE-NEXT: sdiv w9, w9, w10 +; NONEON-NOSVE-NEXT: smov w10, v1.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[6], w8 +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov v4.h[7], w9 +; NONEON-NOSVE-NEXT: mov v5.h[7], w10 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = sdiv <16 x i16> %op1, %op2 @@ -294,6 +735,21 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: 
mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -307,6 +763,26 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w10, s0 +; NONEON-NOSVE-NEXT: mov w11, v0.s[2] +; NONEON-NOSVE-NEXT: mov w12, v0.s[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: fmov w9, s1 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov w10, v1.s[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov w11, v1.s[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.s[2], w10 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -322,6 +798,45 @@ define void @sdiv_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w10, s0 +; NONEON-NOSVE-NEXT: mov w11, v0.s[2] +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w12, v2.s[1] +; NONEON-NOSVE-NEXT: fmov w13, s2 +; NONEON-NOSVE-NEXT: mov w14, v2.s[2] +; NONEON-NOSVE-NEXT: mov w15, v2.s[3] +; NONEON-NOSVE-NEXT: mov w16, v0.s[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: fmov w9, s1 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov w10, 
v1.s[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov w11, v3.s[1] +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: fmov w12, s3 +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: mov w13, v3.s[2] +; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 +; NONEON-NOSVE-NEXT: mov w14, v3.s[3] +; NONEON-NOSVE-NEXT: fmov s0, w12 +; NONEON-NOSVE-NEXT: mov v0.s[1], w11 +; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 +; NONEON-NOSVE-NEXT: mov w15, v1.s[3] +; NONEON-NOSVE-NEXT: fmov s1, w9 +; NONEON-NOSVE-NEXT: mov v0.s[2], w13 +; NONEON-NOSVE-NEXT: mov v1.s[1], w8 +; NONEON-NOSVE-NEXT: mov v1.s[2], w10 +; NONEON-NOSVE-NEXT: sdiv w8, w16, w15 +; NONEON-NOSVE-NEXT: mov v0.s[3], w14 +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = sdiv <8 x i32> %op1, %op2 @@ -338,6 +853,16 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = sdiv <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -351,6 +876,18 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x10, v0.d[1] +; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: sdiv x9, x10, x9 +; 
NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -366,6 +903,29 @@ define void @sdiv_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x10, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x11, d2 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 +; NONEON-NOSVE-NEXT: mov x9, v3.d[1] +; NONEON-NOSVE-NEXT: sdiv x9, x10, x9 +; NONEON-NOSVE-NEXT: fmov x10, d3 +; NONEON-NOSVE-NEXT: sdiv x10, x11, x10 +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: sdiv x11, x12, x11 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = sdiv <4 x i64> %op1, %op2 @@ -391,6 +951,37 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w10, v0.h[0] +; NONEON-NOSVE-NEXT: umov w11, v0.h[2] +; NONEON-NOSVE-NEXT: umov w12, v0.h[3] +; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: and w9, w9, #0xff +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.h[0] +; NONEON-NOSVE-NEXT: and w11, 
w11, #0xff +; NONEON-NOSVE-NEXT: and w9, w9, #0xff +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[2] +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: and w9, w11, #0xff +; NONEON-NOSVE-NEXT: and w11, w12, #0xff +; NONEON-NOSVE-NEXT: udiv w8, w11, w9 +; NONEON-NOSVE-NEXT: mov v0.h[2], w10 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -418,6 +1009,45 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w10, v0.b[0] +; NONEON-NOSVE-NEXT: umov w11, v0.b[2] +; NONEON-NOSVE-NEXT: umov w12, v0.b[3] +; NONEON-NOSVE-NEXT: umov w13, v0.b[4] +; NONEON-NOSVE-NEXT: umov w14, v0.b[5] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.b[0] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: umov w9, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v2.b[2], w10 +; NONEON-NOSVE-NEXT: umov w10, v0.b[6] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.b[5] +; NONEON-NOSVE-NEXT: mov v2.b[3], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[7] +; NONEON-NOSVE-NEXT: 
udiv w8, w14, w13 +; NONEON-NOSVE-NEXT: mov v2.b[4], w12 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: udiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.b[6], w9 +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = udiv <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -465,6 +1095,75 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w10, v0.b[0] +; NONEON-NOSVE-NEXT: umov w11, v0.b[2] +; NONEON-NOSVE-NEXT: umov w12, v0.b[3] +; NONEON-NOSVE-NEXT: umov w13, v0.b[4] +; NONEON-NOSVE-NEXT: umov w14, v0.b[5] +; NONEON-NOSVE-NEXT: umov w15, v0.b[6] +; NONEON-NOSVE-NEXT: umov w16, v0.b[7] +; NONEON-NOSVE-NEXT: umov w17, v0.b[8] +; NONEON-NOSVE-NEXT: umov w18, v0.b[9] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.b[0] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: umov w9, v1.b[10] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v2.b[2], w10 +; NONEON-NOSVE-NEXT: umov w10, v0.b[10] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.b[5] +; NONEON-NOSVE-NEXT: mov v2.b[3], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[11] +; NONEON-NOSVE-NEXT: udiv w13, w14, w13 +; NONEON-NOSVE-NEXT: umov w14, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[4], w12 +; NONEON-NOSVE-NEXT: umov w12, v0.b[12] +; NONEON-NOSVE-NEXT: udiv w14, w15, w14 +; NONEON-NOSVE-NEXT: 
umov w15, v1.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[5], w13 +; NONEON-NOSVE-NEXT: umov w13, v0.b[13] +; NONEON-NOSVE-NEXT: udiv w15, w16, w15 +; NONEON-NOSVE-NEXT: umov w16, v1.b[8] +; NONEON-NOSVE-NEXT: mov v2.b[6], w14 +; NONEON-NOSVE-NEXT: udiv w16, w17, w16 +; NONEON-NOSVE-NEXT: umov w17, v1.b[9] +; NONEON-NOSVE-NEXT: mov v2.b[7], w15 +; NONEON-NOSVE-NEXT: udiv w8, w18, w17 +; NONEON-NOSVE-NEXT: mov v2.b[8], w16 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[11] +; NONEON-NOSVE-NEXT: mov v2.b[9], w8 +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.b[12] +; NONEON-NOSVE-NEXT: mov v2.b[10], w9 +; NONEON-NOSVE-NEXT: umov w9, v1.b[14] +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[11], w10 +; NONEON-NOSVE-NEXT: umov w10, v1.b[15] +; NONEON-NOSVE-NEXT: udiv w8, w13, w12 +; NONEON-NOSVE-NEXT: umov w12, v0.b[14] +; NONEON-NOSVE-NEXT: mov v2.b[12], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[15] +; NONEON-NOSVE-NEXT: udiv w9, w12, w9 +; NONEON-NOSVE-NEXT: mov v2.b[13], w8 +; NONEON-NOSVE-NEXT: udiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.b[14], w9 +; NONEON-NOSVE-NEXT: mov v2.b[15], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = udiv <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -545,6 +1244,163 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str x27, [sp, #-80]! 
// 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w10, v0.b[0] +; NONEON-NOSVE-NEXT: umov w11, v0.b[2] +; NONEON-NOSVE-NEXT: umov w12, v0.b[3] +; NONEON-NOSVE-NEXT: umov w13, v0.b[4] +; NONEON-NOSVE-NEXT: umov w14, v0.b[5] +; NONEON-NOSVE-NEXT: umov w15, v0.b[6] +; NONEON-NOSVE-NEXT: umov w17, v0.b[8] +; NONEON-NOSVE-NEXT: umov w2, v0.b[10] +; NONEON-NOSVE-NEXT: umov w3, v0.b[11] +; NONEON-NOSVE-NEXT: umov w4, v0.b[12] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.b[0] +; NONEON-NOSVE-NEXT: umov w5, v0.b[13] +; NONEON-NOSVE-NEXT: umov w6, v0.b[14] +; NONEON-NOSVE-NEXT: umov w1, v3.b[1] +; NONEON-NOSVE-NEXT: umov w7, v2.b[0] +; NONEON-NOSVE-NEXT: umov w19, v2.b[2] +; NONEON-NOSVE-NEXT: umov w20, v2.b[3] +; NONEON-NOSVE-NEXT: umov w21, v2.b[4] +; NONEON-NOSVE-NEXT: umov w22, v2.b[5] +; NONEON-NOSVE-NEXT: umov w23, v2.b[6] +; NONEON-NOSVE-NEXT: umov w24, v2.b[7] +; NONEON-NOSVE-NEXT: umov w25, v2.b[8] +; NONEON-NOSVE-NEXT: umov w26, v2.b[9] +; NONEON-NOSVE-NEXT: umov w27, v2.b[10] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; 
NONEON-NOSVE-NEXT: umov w10, v1.b[2] +; NONEON-NOSVE-NEXT: udiv w11, w11, w10 +; NONEON-NOSVE-NEXT: umov w10, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s5, w9 +; NONEON-NOSVE-NEXT: umov w9, v3.b[11] +; NONEON-NOSVE-NEXT: mov v5.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w10, w12, w10 +; NONEON-NOSVE-NEXT: umov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v5.b[2], w11 +; NONEON-NOSVE-NEXT: umov w11, v2.b[11] +; NONEON-NOSVE-NEXT: udiv w13, w13, w12 +; NONEON-NOSVE-NEXT: umov w12, v1.b[5] +; NONEON-NOSVE-NEXT: mov v5.b[3], w10 +; NONEON-NOSVE-NEXT: umov w10, v3.b[12] +; NONEON-NOSVE-NEXT: udiv w12, w14, w12 +; NONEON-NOSVE-NEXT: umov w14, v1.b[6] +; NONEON-NOSVE-NEXT: mov v5.b[4], w13 +; NONEON-NOSVE-NEXT: umov w13, v2.b[14] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: umov w14, v1.b[7] +; NONEON-NOSVE-NEXT: umov w15, v0.b[7] +; NONEON-NOSVE-NEXT: mov v5.b[5], w12 +; NONEON-NOSVE-NEXT: umov w12, v2.b[13] +; NONEON-NOSVE-NEXT: udiv w14, w15, w14 +; NONEON-NOSVE-NEXT: umov w15, v1.b[8] +; NONEON-NOSVE-NEXT: mov v5.b[6], w16 +; NONEON-NOSVE-NEXT: udiv w18, w17, w15 +; NONEON-NOSVE-NEXT: umov w15, v1.b[9] +; NONEON-NOSVE-NEXT: umov w17, v0.b[9] +; NONEON-NOSVE-NEXT: mov v5.b[7], w14 +; NONEON-NOSVE-NEXT: udiv w17, w17, w15 +; NONEON-NOSVE-NEXT: umov w15, v1.b[10] +; NONEON-NOSVE-NEXT: mov v5.b[8], w18 +; NONEON-NOSVE-NEXT: udiv w15, w2, w15 +; NONEON-NOSVE-NEXT: umov w2, v1.b[11] +; NONEON-NOSVE-NEXT: mov v5.b[9], w17 +; NONEON-NOSVE-NEXT: udiv w2, w3, w2 +; NONEON-NOSVE-NEXT: umov w3, v1.b[12] +; NONEON-NOSVE-NEXT: mov v5.b[10], w15 +; NONEON-NOSVE-NEXT: udiv w3, w4, w3 +; NONEON-NOSVE-NEXT: umov w4, v1.b[13] +; NONEON-NOSVE-NEXT: mov v5.b[11], w2 +; NONEON-NOSVE-NEXT: udiv w4, w5, w4 +; NONEON-NOSVE-NEXT: umov w5, v1.b[14] +; NONEON-NOSVE-NEXT: mov v5.b[12], w3 +; NONEON-NOSVE-NEXT: udiv w5, w6, w5 +; NONEON-NOSVE-NEXT: umov w6, v2.b[1] +; NONEON-NOSVE-NEXT: mov v5.b[13], w4 +; NONEON-NOSVE-NEXT: udiv w1, w6, w1 +; NONEON-NOSVE-NEXT: umov w6, v3.b[0] +; 
NONEON-NOSVE-NEXT: mov v5.b[14], w5 +; NONEON-NOSVE-NEXT: udiv w6, w7, w6 +; NONEON-NOSVE-NEXT: umov w7, v3.b[2] +; NONEON-NOSVE-NEXT: udiv w7, w19, w7 +; NONEON-NOSVE-NEXT: umov w19, v3.b[3] +; NONEON-NOSVE-NEXT: fmov s4, w6 +; NONEON-NOSVE-NEXT: mov v4.b[1], w1 +; NONEON-NOSVE-NEXT: udiv w19, w20, w19 +; NONEON-NOSVE-NEXT: umov w20, v3.b[4] +; NONEON-NOSVE-NEXT: mov v4.b[2], w7 +; NONEON-NOSVE-NEXT: udiv w20, w21, w20 +; NONEON-NOSVE-NEXT: umov w21, v3.b[5] +; NONEON-NOSVE-NEXT: mov v4.b[3], w19 +; NONEON-NOSVE-NEXT: udiv w21, w22, w21 +; NONEON-NOSVE-NEXT: umov w22, v3.b[6] +; NONEON-NOSVE-NEXT: mov v4.b[4], w20 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w22, w23, w22 +; NONEON-NOSVE-NEXT: umov w23, v3.b[7] +; NONEON-NOSVE-NEXT: mov v4.b[5], w21 +; NONEON-NOSVE-NEXT: udiv w23, w24, w23 +; NONEON-NOSVE-NEXT: umov w24, v3.b[8] +; NONEON-NOSVE-NEXT: mov v4.b[6], w22 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w24, w25, w24 +; NONEON-NOSVE-NEXT: umov w25, v3.b[9] +; NONEON-NOSVE-NEXT: mov v4.b[7], w23 +; NONEON-NOSVE-NEXT: udiv w25, w26, w25 +; NONEON-NOSVE-NEXT: umov w26, v3.b[10] +; NONEON-NOSVE-NEXT: mov v4.b[8], w24 +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w8, w27, w26 +; NONEON-NOSVE-NEXT: mov v4.b[9], w25 +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w9, w11, w9 +; NONEON-NOSVE-NEXT: umov w11, v2.b[12] +; NONEON-NOSVE-NEXT: mov v4.b[10], w8 +; NONEON-NOSVE-NEXT: umov w8, v3.b[15] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v3.b[13] +; NONEON-NOSVE-NEXT: mov v4.b[11], w9 +; NONEON-NOSVE-NEXT: umov w9, v1.b[15] +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v3.b[14] +; NONEON-NOSVE-NEXT: mov v4.b[12], w10 +; NONEON-NOSVE-NEXT: umov w10, v0.b[15] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; 
NONEON-NOSVE-NEXT: umov w13, v2.b[15] +; NONEON-NOSVE-NEXT: mov v4.b[13], w11 +; NONEON-NOSVE-NEXT: udiv w8, w13, w8 +; NONEON-NOSVE-NEXT: mov v4.b[14], w12 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov v4.b[15], w8 +; NONEON-NOSVE-NEXT: mov v5.b[15], w9 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = udiv <32 x i8> %op1, %op2 @@ -563,6 +1419,22 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -579,6 +1451,29 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w10, v0.h[0] +; NONEON-NOSVE-NEXT: umov w11, v0.h[2] +; NONEON-NOSVE-NEXT: umov w12, v0.h[3] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.h[0] +; 
NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.h[2], w10 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -605,6 +1500,43 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w10, v0.h[0] +; NONEON-NOSVE-NEXT: umov w11, v0.h[2] +; NONEON-NOSVE-NEXT: umov w12, v0.h[3] +; NONEON-NOSVE-NEXT: umov w13, v0.h[4] +; NONEON-NOSVE-NEXT: umov w14, v0.h[5] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.h[0] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: umov w9, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[2], w10 +; NONEON-NOSVE-NEXT: umov w10, v0.h[6] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.h[5] +; NONEON-NOSVE-NEXT: mov v2.h[3], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.h[7] +; NONEON-NOSVE-NEXT: udiv w8, w14, w13 +; NONEON-NOSVE-NEXT: mov v2.h[4], w12 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: udiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.h[6], w9 +; NONEON-NOSVE-NEXT: mov 
v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = udiv <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -645,6 +1577,79 @@ define void @udiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w10, v0.h[0] +; NONEON-NOSVE-NEXT: umov w11, v0.h[2] +; NONEON-NOSVE-NEXT: umov w12, v0.h[3] +; NONEON-NOSVE-NEXT: umov w13, v0.h[4] +; NONEON-NOSVE-NEXT: umov w14, v0.h[5] +; NONEON-NOSVE-NEXT: umov w15, v0.h[6] +; NONEON-NOSVE-NEXT: umov w16, v2.h[1] +; NONEON-NOSVE-NEXT: umov w17, v2.h[0] +; NONEON-NOSVE-NEXT: umov w18, v2.h[2] +; NONEON-NOSVE-NEXT: umov w1, v2.h[3] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.h[0] +; NONEON-NOSVE-NEXT: umov w2, v2.h[4] +; NONEON-NOSVE-NEXT: umov w3, v2.h[5] +; NONEON-NOSVE-NEXT: umov w4, v2.h[6] +; NONEON-NOSVE-NEXT: udiv w10, w10, w9 +; NONEON-NOSVE-NEXT: umov w9, v1.h[2] +; NONEON-NOSVE-NEXT: udiv w9, w11, w9 +; NONEON-NOSVE-NEXT: umov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s5, w10 +; NONEON-NOSVE-NEXT: umov w10, v3.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[2], w9 +; NONEON-NOSVE-NEXT: umov w9, v2.h[7] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[3], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.h[7] +; NONEON-NOSVE-NEXT: udiv w13, w14, w13 +; NONEON-NOSVE-NEXT: umov w14, v1.h[6] +; NONEON-NOSVE-NEXT: mov v5.h[4], w12 +; NONEON-NOSVE-NEXT: udiv w14, w15, w14 +; NONEON-NOSVE-NEXT: umov w15, v3.h[1] +; NONEON-NOSVE-NEXT: mov 
v5.h[5], w13 +; NONEON-NOSVE-NEXT: udiv w15, w16, w15 +; NONEON-NOSVE-NEXT: umov w16, v3.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[6], w14 +; NONEON-NOSVE-NEXT: udiv w16, w17, w16 +; NONEON-NOSVE-NEXT: umov w17, v3.h[2] +; NONEON-NOSVE-NEXT: udiv w17, w18, w17 +; NONEON-NOSVE-NEXT: umov w18, v3.h[3] +; NONEON-NOSVE-NEXT: fmov s4, w16 +; NONEON-NOSVE-NEXT: mov v4.h[1], w15 +; NONEON-NOSVE-NEXT: udiv w18, w1, w18 +; NONEON-NOSVE-NEXT: umov w1, v3.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[2], w17 +; NONEON-NOSVE-NEXT: udiv w1, w2, w1 +; NONEON-NOSVE-NEXT: umov w2, v3.h[5] +; NONEON-NOSVE-NEXT: mov v4.h[3], w18 +; NONEON-NOSVE-NEXT: udiv w2, w3, w2 +; NONEON-NOSVE-NEXT: umov w3, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[4], w1 +; NONEON-NOSVE-NEXT: udiv w8, w4, w3 +; NONEON-NOSVE-NEXT: mov v4.h[5], w2 +; NONEON-NOSVE-NEXT: udiv w9, w9, w10 +; NONEON-NOSVE-NEXT: umov w10, v1.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[6], w8 +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov v4.h[7], w9 +; NONEON-NOSVE-NEXT: mov v5.h[7], w10 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = udiv <16 x i16> %op1, %op2 @@ -661,6 +1666,21 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i32> %op1, %op2 ret <2 x i32> 
%res } @@ -674,6 +1694,26 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w10, s0 +; NONEON-NOSVE-NEXT: mov w11, v0.s[2] +; NONEON-NOSVE-NEXT: mov w12, v0.s[3] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: fmov w9, s1 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov w10, v1.s[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov w11, v1.s[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: udiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.s[2], w10 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -689,6 +1729,45 @@ define void @udiv_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w10, s0 +; NONEON-NOSVE-NEXT: mov w11, v0.s[2] +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w12, v2.s[1] +; NONEON-NOSVE-NEXT: fmov w13, s2 +; NONEON-NOSVE-NEXT: mov w14, v2.s[2] +; NONEON-NOSVE-NEXT: mov w15, v2.s[3] +; NONEON-NOSVE-NEXT: mov w16, v0.s[3] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: fmov w9, s1 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov w10, v1.s[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov w11, v3.s[1] +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: fmov w12, s3 +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: mov w13, v3.s[2] +; 
NONEON-NOSVE-NEXT: udiv w13, w14, w13 +; NONEON-NOSVE-NEXT: mov w14, v3.s[3] +; NONEON-NOSVE-NEXT: fmov s0, w12 +; NONEON-NOSVE-NEXT: mov v0.s[1], w11 +; NONEON-NOSVE-NEXT: udiv w14, w15, w14 +; NONEON-NOSVE-NEXT: mov w15, v1.s[3] +; NONEON-NOSVE-NEXT: fmov s1, w9 +; NONEON-NOSVE-NEXT: mov v0.s[2], w13 +; NONEON-NOSVE-NEXT: mov v1.s[1], w8 +; NONEON-NOSVE-NEXT: mov v1.s[2], w10 +; NONEON-NOSVE-NEXT: udiv w8, w16, w15 +; NONEON-NOSVE-NEXT: mov v0.s[3], w14 +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = udiv <8 x i32> %op1, %op2 @@ -705,6 +1784,16 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: udiv x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = udiv <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -718,6 +1807,18 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x10, v0.d[1] +; NONEON-NOSVE-NEXT: udiv x8, x9, x8 +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: udiv x9, x10, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -733,6 +1834,29 @@ define void @udiv_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z3.d ; 
CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x10, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x11, d2 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: udiv x8, x9, x8 +; NONEON-NOSVE-NEXT: mov x9, v3.d[1] +; NONEON-NOSVE-NEXT: udiv x9, x10, x9 +; NONEON-NOSVE-NEXT: fmov x10, d3 +; NONEON-NOSVE-NEXT: udiv x10, x11, x10 +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: udiv x11, x12, x11 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = udiv <4 x i64> %op1, %op2 @@ -778,6 +1902,27 @@ define void @udiv_constantsplat_v8i32(ptr %a) { ; SVE2-NEXT: lsr z0.s, z0.s, #6 ; SVE2-NEXT: stp q1, q0, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #8969 // =0x2309 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: movk w8, #22765, lsl #16 +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: umull2 v3.2d, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umull v4.2d, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: umull2 v5.2d, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: umull v0.2d, v2.2s, v0.2s +; NONEON-NOSVE-NEXT: uzp2 v3.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v5.4s +; NONEON-NOSVE-NEXT: sub v1.4s, v1.4s, v3.4s +; NONEON-NOSVE-NEXT: sub v2.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: usra v3.4s, v1.4s, #1 +; NONEON-NOSVE-NEXT: usra v0.4s, v2.4s, #1 +; NONEON-NOSVE-NEXT: ushr v1.4s, v3.4s, #6 +; NONEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #6 +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res 
= udiv <8 x i32> %op1, store <8 x i32> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll index e40668a8696ee2..9f8511b00c6ed1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -26,6 +27,22 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) { ; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i1_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: shl v0.4s, v0.4s, #31 +; NONEON-NOSVE-NEXT: shl v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: cmlt v0.4s, v0.4s, #0 +; NONEON-NOSVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i1> %a to <8 x i32> store <8 x i32> %b, ptr %out ret void @@ -52,6 +69,22 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) { ; CHECK-NEXT: asr z0.d, z0.d, #61 ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v4i3_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #61 +; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #61 +; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #61 +; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #61 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <4 x i3> %a to <4 x i64> store <4 x i64> %b, ptr %out ret void @@ -70,6 +103,17 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i8_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i16> store <16 x i16>%b, ptr %out ret void @@ -91,6 +135,24 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v32i8_v32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = sext <32 x i8> %b to <32 x i16> @@ -112,6 +174,18 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i8_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i8> %a to <8 x i32> store <8 x i32>%b, ptr %out ret void @@ -133,6 +207,25 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i8_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i32> store <16 x i32> %b, ptr %out ret void @@ -167,6 +260,40 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v32i8_v32i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] +; NONEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0 +; NONEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: sshll v0.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v6.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: sshll v1.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v7.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = sext <32 x i8> %b to <32 x i32> @@ -194,6 +321,22 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; CHECK-NEXT: sxtb z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v4i8_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #56 +; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #56 +; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #56 +; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #56 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <4 x i8> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -216,6 +359,26 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i8_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i8> %a to <8 x i64> store <8 x i64>%b, ptr %out ret void @@ -253,6 +416,41 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q1, q4, [x0, #32] ; CHECK-NEXT: stp q0, q2, [x0, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i8_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-112]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #48] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #72] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q5, [x0, #64] +; NONEON-NOSVE-NEXT: sshll v1.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q4, [x0] +; NONEON-NOSVE-NEXT: sshll v0.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #96] +; NONEON-NOSVE-NEXT: stp q0, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #112 +; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i64> store <16 x i64> %b, ptr %out ret void @@ -321,6 +519,73 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q0, q2, [x1, #224] ; CHECK-NEXT: stp q3, q1, [x1, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v32i8_v32i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #224 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: sshll v5.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v6.8h, v1.8b, #0 +; 
NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v3.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v4.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: stp q3, q5, [sp, #32] +; NONEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #64] +; NONEON-NOSVE-NEXT: sshll v6.4s, v6.4h, #0 +; NONEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v7.4s, v7.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q5, [sp, #128] +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ldr d19, [sp, #152] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #96] +; NONEON-NOSVE-NEXT: ldr d20, [sp, #136] +; NONEON-NOSVE-NEXT: stp q1, q4, [sp, #160] +; NONEON-NOSVE-NEXT: ldr d17, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d21, [sp, #120] +; NONEON-NOSVE-NEXT: stp q7, q6, [sp, #192] +; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: sshll v19.2d, v19.2s, #0 +; NONEON-NOSVE-NEXT: ldr d16, [sp, #216] +; NONEON-NOSVE-NEXT: ldr d22, [sp, #200] +; NONEON-NOSVE-NEXT: ldr d23, [sp, #184] +; NONEON-NOSVE-NEXT: ldr d18, [sp, #168] +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: sshll v16.2d, v16.2s, #0 +; NONEON-NOSVE-NEXT: stp q5, q19, [x1] +; NONEON-NOSVE-NEXT: sshll v5.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: sshll v7.2d, v22.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: stp q6, q16, [x1, #128] +; NONEON-NOSVE-NEXT: sshll v6.2d, v23.2s, #0 +; NONEON-NOSVE-NEXT: stp q5, q7, [x1, #160] +; NONEON-NOSVE-NEXT: sshll 
v5.2d, v20.2s, #0 +; NONEON-NOSVE-NEXT: stp q4, q6, [x1, #192] +; NONEON-NOSVE-NEXT: sshll v4.2d, v21.2s, #0 +; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #32] +; NONEON-NOSVE-NEXT: sshll v2.2d, v17.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #64] +; NONEON-NOSVE-NEXT: sshll v3.2d, v18.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #96] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #224] +; NONEON-NOSVE-NEXT: add sp, sp, #224 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = sext <32 x i8> %b to <32 x i64> @@ -341,6 +606,17 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i16_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i16> %a to <8 x i32> store <8 x i32>%b, ptr %out ret void @@ -361,6 +637,24 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i16_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a %c = sext <16 x i16> %b to <16 x i32> @@ -382,6 +676,18 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v4i16_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <4 x i16> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -403,6 +709,25 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i16_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i16> %a to <8 x i64> store <8 x i64>%b, ptr %out ret void @@ -437,6 +762,40 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i16_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: sshll v0.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: sshll v1.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a %c = sext <16 x i16> %b to <16 x i64> @@ -457,6 +816,17 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v4i32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <4 x i32> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -477,6 +847,24 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i32_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = add <8 x i32> %a, %a %c = sext <8 x i32> %b to <8 x i64> @@ -497,6 +885,17 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i8_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i16> store <16 x i16>%b, ptr %out ret void @@ -518,6 +917,24 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v32i8_v32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = zext <32 x i8> %b to <32 x i16> @@ -539,6 +956,18 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i8_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <8 x i8> %a to <8 x i32> store <8 x i32>%b, ptr %out ret void @@ -560,6 +989,25 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i8_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i32> store <16 x i32> %b, ptr %out ret void @@ -594,6 +1042,40 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v32i8_v32i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0 +; NONEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: ushll v0.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v6.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: ushll v1.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v7.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = zext <32 x i8> %b to <32 x i32> @@ -619,6 +1101,20 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v4i8_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <4 x i8> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -641,6 +1137,26 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i8_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = zext <8 x i8> %a to <8 x i64> store <8 x i64>%b, ptr %out ret void @@ -678,6 +1194,41 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q1, q4, [x0, #32] ; CHECK-NEXT: stp q0, q2, [x0, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i8_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-112]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #48] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] +; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q5, [x0, #64] +; NONEON-NOSVE-NEXT: ushll v1.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q4, [x0] +; NONEON-NOSVE-NEXT: ushll v0.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #96] +; NONEON-NOSVE-NEXT: stp q0, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #112 +; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i64> store <16 x i64> %b, ptr %out ret void @@ -746,6 +1297,73 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q0, q2, [x1, #224] ; CHECK-NEXT: stp q3, q1, [x1, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v32i8_v32i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #224 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ushll v5.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v6.8h, v1.8b, #0 +; 
NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v3.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v4.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: stp q3, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #64] +; NONEON-NOSVE-NEXT: ushll v6.4s, v6.4h, #0 +; NONEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v7.4s, v7.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q5, [sp, #128] +; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ldr d19, [sp, #152] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #96] +; NONEON-NOSVE-NEXT: ldr d20, [sp, #136] +; NONEON-NOSVE-NEXT: stp q1, q4, [sp, #160] +; NONEON-NOSVE-NEXT: ldr d17, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d21, [sp, #120] +; NONEON-NOSVE-NEXT: stp q7, q6, [sp, #192] +; NONEON-NOSVE-NEXT: ushll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: ushll v19.2d, v19.2s, #0 +; NONEON-NOSVE-NEXT: ldr d16, [sp, #216] +; NONEON-NOSVE-NEXT: ldr d22, [sp, #200] +; NONEON-NOSVE-NEXT: ldr d23, [sp, #184] +; NONEON-NOSVE-NEXT: ldr d18, [sp, #168] +; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v16.2d, v16.2s, #0 +; NONEON-NOSVE-NEXT: stp q5, q19, [x1] +; NONEON-NOSVE-NEXT: ushll v5.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: ushll v7.2d, v22.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: stp q6, q16, [x1, #128] +; NONEON-NOSVE-NEXT: ushll v6.2d, v23.2s, #0 +; NONEON-NOSVE-NEXT: stp q5, q7, [x1, #160] +; NONEON-NOSVE-NEXT: ushll 
v5.2d, v20.2s, #0 +; NONEON-NOSVE-NEXT: stp q4, q6, [x1, #192] +; NONEON-NOSVE-NEXT: ushll v4.2d, v21.2s, #0 +; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #32] +; NONEON-NOSVE-NEXT: ushll v2.2d, v17.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #64] +; NONEON-NOSVE-NEXT: ushll v3.2d, v18.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #96] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #224] +; NONEON-NOSVE-NEXT: add sp, sp, #224 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = zext <32 x i8> %b to <32 x i64> @@ -766,6 +1384,17 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i16_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <8 x i16> %a to <8 x i32> store <8 x i32>%b, ptr %out ret void @@ -786,6 +1415,24 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i16_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a %c = zext <16 x i16> %b to <16 x i32> @@ -807,6 +1454,18 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v4i16_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <4 x i16> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -828,6 +1487,25 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i16_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = zext <8 x i16> %a to <8 x i64> store <8 x i64>%b, ptr %out ret void @@ -862,6 +1540,40 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i16_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: ushll v0.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: ushll v1.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a %c = zext <16 x i16> %b to <16 x i64> @@ -882,6 +1594,17 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v4i32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <4 x i32> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -902,6 +1625,24 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i32_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = add <8 x i32> %a, %a %c = zext <8 x i32> %b to <8 x i64> @@ -928,6 +1669,21 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) { ; SVE2-NEXT: mul z0.d, z1.d, z0.d ; SVE2-NEXT: str q0, [x1] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: extend_and_mul: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v1.2s, w0 +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: fmov x11, d1 +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %broadcast.splatinsert2 = 
insertelement <2 x i32> poison, i32 %0, i64 0 %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer %4 = zext <2 x i32> %broadcast.splat3 to <2 x i64> @@ -943,6 +1699,13 @@ define void @extend_no_mul(i32 %0, <2 x i64> %1, ptr %2) { ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extend_no_mul: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret entry: %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0 %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll index 54276bb4ba01d2..ade60b07150ce2 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -22,6 +23,15 @@ define void @add_v32i8(ptr %a) { ; CHECK-NEXT: add z1.b, z1.b, #7 // =0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins 
= insertelement <32 x i8> undef, i8 7, i32 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -38,6 +48,16 @@ define void @add_v16i16(ptr %a) { ; CHECK-NEXT: add z1.h, z1.h, #15 // =0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: add v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -54,6 +74,16 @@ define void @add_v8i32(ptr %a) { ; CHECK-NEXT: add z1.s, z1.s, #31 // =0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -70,6 +100,16 @@ define void @add_v4i64(ptr %a) { ; CHECK-NEXT: add z1.d, z1.d, #63 // =0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: add v1.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: add v0.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = 
shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -90,6 +130,15 @@ define void @and_v32i8(ptr %a) { ; CHECK-NEXT: and z1.b, z1.b, #0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -106,6 +155,16 @@ define void @and_v16i16(ptr %a) { ; CHECK-NEXT: and z1.h, z1.h, #0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -122,6 +181,16 @@ define void @and_v8i32(ptr %a) { ; CHECK-NEXT: and z1.s, z1.s, #0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -138,6 +207,16 @@ define 
void @and_v4i64(ptr %a) { ; CHECK-NEXT: and z1.d, z1.d, #0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -158,6 +237,14 @@ define void @ashr_v32i8(ptr %a) { ; CHECK-NEXT: asr z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -174,6 +261,14 @@ define void @ashr_v16i16(ptr %a) { ; CHECK-NEXT: asr z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v0.8h, v0.8h, #0 +; NONEON-NOSVE-NEXT: cmlt v1.8h, v1.8h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -190,6 +285,14 @@ define void @ashr_v8i32(ptr %a) { ; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: cmlt v0.4s, v0.4s, #0 +; NONEON-NOSVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -206,6 +309,14 @@ define void @ashr_v4i64(ptr %a) { ; CHECK-NEXT: asr z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v0.2d, v0.2d, #0 +; NONEON-NOSVE-NEXT: cmlt v1.2d, v1.2d, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -229,6 +340,15 @@ define void @icmp_eq_v32i8(ptr %a) { ; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cmeq v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -249,6 +369,16 @@ define void @icmp_sge_v16i16(ptr %a) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sge_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: cmge v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: cmge v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -269,6 +399,16 @@ define void @icmp_sgt_v8i32(ptr %a) { ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sgt_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #-8 // =0xfffffff8 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: cmgt v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: cmgt v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 -8, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -289,6 +429,16 @@ define void @icmp_ult_v4i64(ptr %a) { ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ult_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmhi v1.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmhi v0.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -310,6 +460,14 @@ define void @lshr_v32i8(ptr %a) { ; CHECK-NEXT: lsr z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: ushr v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x 
i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -326,6 +484,14 @@ define void @lshr_v16i16(ptr %a) { ; CHECK-NEXT: lsr z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v0.8h, v0.8h, #15 +; NONEON-NOSVE-NEXT: ushr v1.8h, v1.8h, #15 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -342,6 +508,14 @@ define void @lshr_v8i32(ptr %a) { ; CHECK-NEXT: lsr z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #31 +; NONEON-NOSVE-NEXT: ushr v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -358,6 +532,14 @@ define void @lshr_v4i64(ptr %a) { ; CHECK-NEXT: lsr z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v0.2d, v0.2d, #63 +; NONEON-NOSVE-NEXT: ushr v1.2d, v1.2d, #63 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -378,6 +560,15 @@ define void @mul_v32i8(ptr %a) { ; CHECK-NEXT: mul z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v32i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: mul v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: mul v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -394,6 +585,16 @@ define void @mul_v16i16(ptr %a) { ; CHECK-NEXT: mul z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: mul v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: mul v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -410,6 +611,16 @@ define void @mul_v8i32(ptr %a) { ; CHECK-NEXT: mul z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: mul v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: mul v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -426,6 +637,28 @@ define void @mul_v4i64(ptr %a) { ; CHECK-NEXT: mul z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: fmov x11, d1 +; 
NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: lsl x12, x10, #6 +; NONEON-NOSVE-NEXT: lsl x13, x11, #6 +; NONEON-NOSVE-NEXT: lsl x14, x8, #6 +; NONEON-NOSVE-NEXT: sub x10, x12, x10 +; NONEON-NOSVE-NEXT: sub x11, x13, x11 +; NONEON-NOSVE-NEXT: lsl x12, x9, #6 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: fmov d1, x11 +; NONEON-NOSVE-NEXT: sub x8, x14, x8 +; NONEON-NOSVE-NEXT: sub x9, x12, x9 +; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: mov v1.d[1], x9 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -446,6 +679,15 @@ define void @or_v32i8(ptr %a) { ; CHECK-NEXT: orr z1.b, z1.b, #0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -462,6 +704,16 @@ define void @or_v16i16(ptr %a) { ; CHECK-NEXT: orr z1.h, z1.h, #0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> 
zeroinitializer @@ -478,6 +730,16 @@ define void @or_v8i32(ptr %a) { ; CHECK-NEXT: orr z1.s, z1.s, #0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -494,6 +756,16 @@ define void @or_v4i64(ptr %a) { ; CHECK-NEXT: orr z1.d, z1.d, #0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -514,6 +786,14 @@ define void @shl_v32i8(ptr %a) { ; CHECK-NEXT: lsl z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -530,6 +810,14 @@ define void @shl_v16i16(ptr %a) { ; CHECK-NEXT: lsl z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; 
+; NONEON-NOSVE-LABEL: shl_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: shl v0.8h, v0.8h, #15 +; NONEON-NOSVE-NEXT: shl v1.8h, v1.8h, #15 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -546,6 +834,14 @@ define void @shl_v8i32(ptr %a) { ; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: shl v0.4s, v0.4s, #31 +; NONEON-NOSVE-NEXT: shl v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -562,6 +858,14 @@ define void @shl_v4i64(ptr %a) { ; CHECK-NEXT: lsl z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #63 +; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #63 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -582,6 +886,15 @@ define void @smax_v32i8(ptr %a) { ; CHECK-NEXT: smax z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smax v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smax v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, 
ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -598,6 +911,16 @@ define void @smax_v16i16(ptr %a) { ; CHECK-NEXT: smax z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: smax v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smax v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -614,6 +937,16 @@ define void @smax_v8i32(ptr %a) { ; CHECK-NEXT: smax z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: smax v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smax v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -630,6 +963,18 @@ define void @smax_v4i64(ptr %a) { ; CHECK-NEXT: smax z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmgt v3.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmgt v4.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -650,6 +995,15 @@ define void @smin_v32i8(ptr %a) { ; CHECK-NEXT: smin z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smin v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smin v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -666,6 +1020,16 @@ define void @smin_v16i16(ptr %a) { ; CHECK-NEXT: smin z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: smin v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smin v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -682,6 +1046,16 @@ define void @smin_v8i32(ptr %a) { ; CHECK-NEXT: smin z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: smin v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smin v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, 
i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -698,6 +1072,18 @@ define void @smin_v4i64(ptr %a) { ; CHECK-NEXT: smin z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmgt v3.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmgt v4.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -718,6 +1104,15 @@ define void @sub_v32i8(ptr %a) { ; CHECK-NEXT: sub z1.b, z1.b, #7 // =0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: sub v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -734,6 +1129,16 @@ define void @sub_v16i16(ptr %a) { ; CHECK-NEXT: sub z1.h, z1.h, #15 // =0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: sub v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 
15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -750,6 +1155,16 @@ define void @sub_v8i32(ptr %a) { ; CHECK-NEXT: sub z1.s, z1.s, #31 // =0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: sub v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -766,6 +1181,16 @@ define void @sub_v4i64(ptr %a) { ; CHECK-NEXT: sub z1.d, z1.d, #63 // =0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: sub v1.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: sub v0.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -786,6 +1211,15 @@ define void @umax_v32i8(ptr %a) { ; CHECK-NEXT: umax z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umax v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umax v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ 
-802,6 +1236,16 @@ define void @umax_v16i16(ptr %a) { ; CHECK-NEXT: umax z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: umax v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umax v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -818,6 +1262,16 @@ define void @umax_v8i32(ptr %a) { ; CHECK-NEXT: umax z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: umax v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umax v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -834,6 +1288,18 @@ define void @umax_v4i64(ptr %a) { ; CHECK-NEXT: umax z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmhi v3.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmhi v4.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> 
undef, <4 x i32> zeroinitializer @@ -854,6 +1320,15 @@ define void @umin_v32i8(ptr %a) { ; CHECK-NEXT: umin z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umin v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umin v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -870,6 +1345,16 @@ define void @umin_v16i16(ptr %a) { ; CHECK-NEXT: umin z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: umin v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umin v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -886,6 +1371,16 @@ define void @umin_v8i32(ptr %a) { ; CHECK-NEXT: umin z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: umin v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umin v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -902,6 +1397,18 @@ define void @umin_v4i64(ptr %a) { ; 
CHECK-NEXT: umin z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmhi v3.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmhi v4.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -922,6 +1429,15 @@ define void @xor_v32i8(ptr %a) { ; CHECK-NEXT: eor z1.b, z1.b, #0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -938,6 +1454,16 @@ define void @xor_v16i16(ptr %a) { ; CHECK-NEXT: eor z1.h, z1.h, #0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -954,6 +1480,16 @@ define void @xor_v8i32(ptr %a) { ; 
CHECK-NEXT: eor z1.s, z1.s, #0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -970,6 +1506,16 @@ define void @xor_v4i64(ptr %a) { ; CHECK-NEXT: eor z1.d, z1.d, #0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll index 40824ba9ae9c5f..4fc7ec3a8439df 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,11 @@ define <8 x i8> @and_v8i8(<8 
x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = and <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -28,6 +34,11 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = and <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -41,6 +52,15 @@ define void @and_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: and z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = and <32 x i8> %op1, %op2 @@ -56,6 +76,11 @@ define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = and <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -68,6 +93,11 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = and <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ 
-81,6 +111,15 @@ define void @and_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: and z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = and <16 x i16> %op1, %op2 @@ -96,6 +135,11 @@ define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = and <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -108,6 +152,11 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = and <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -121,6 +170,15 @@ define void @and_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: and z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = and <8 x i32> %op1, %op2 @@ -136,6 +194,11 @@ define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = and <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -148,6 +211,11 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = and <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -161,6 +229,15 @@ define void @and_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: and z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = and <4 x i64> %op1, %op2 @@ -180,6 +257,11 @@ define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = or <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -192,6 +274,11 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = or <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -205,6 +292,15 @@ define void @or_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: orr z1.d, z2.d, z3.d ; 
CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = or <32 x i8> %op1, %op2 @@ -220,6 +316,11 @@ define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = or <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -232,6 +333,11 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = or <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -245,6 +351,15 @@ define void @or_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: orr z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = or <16 x i16> %op1, %op2 @@ -260,6 +375,11 @@ define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v2i32: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = or <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -272,6 +392,11 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = or <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -285,6 +410,15 @@ define void @or_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: orr z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = or <8 x i32> %op1, %op2 @@ -300,6 +434,11 @@ define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = or <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -312,6 +451,11 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = or <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -325,6 +469,15 @@ define void @or_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: orr z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v4i64: +; NONEON-NOSVE: // %bb.0: 
+; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = or <4 x i64> %op1, %op2 @@ -344,6 +497,11 @@ define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = xor <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -356,6 +514,11 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = xor <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -369,6 +532,15 @@ define void @xor_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: eor z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = xor <32 x i8> %op1, %op2 @@ -384,6 +556,11 @@ define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = xor <4 x i16> %op1, %op2 ret <4 x i16> 
%res } @@ -396,6 +573,11 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = xor <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -409,6 +591,15 @@ define void @xor_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: eor z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = xor <16 x i16> %op1, %op2 @@ -424,6 +615,11 @@ define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = xor <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -436,6 +632,11 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = xor <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -449,6 +650,15 @@ define void @xor_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: eor z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; 
NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = xor <8 x i32> %op1, %op2 @@ -464,6 +674,11 @@ define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = xor <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -476,6 +691,11 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = xor <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -489,6 +709,15 @@ define void @xor_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: eor z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = xor <4 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll index 74ee5482a60c41..b9c859a58611e8 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,11 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res } @@ -30,6 +36,11 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res } @@ -45,6 +56,15 @@ define void @smax_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: smax z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smax v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smax v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2) @@ -61,6 +81,11 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v4i16: +; NONEON-NOSVE: // %bb.0: 
+; NONEON-NOSVE-NEXT: smax v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res } @@ -74,6 +99,11 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res } @@ -89,6 +119,15 @@ define void @smax_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: smax z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smax v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smax v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2) @@ -105,6 +144,11 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res } @@ -118,6 +162,11 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> 
%op1, <4 x i32> %op2) ret <4 x i32> %res } @@ -133,6 +182,15 @@ define void @smax_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: smax z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2) @@ -150,6 +208,12 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res } @@ -164,6 +228,12 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmgt v2.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res } @@ -179,6 +249,18 @@ define void @smax_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: smax z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmgt v4.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmgt v5.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: bit v0.16b, 
v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2) @@ -199,6 +281,11 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res } @@ -212,6 +299,11 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res } @@ -227,6 +319,15 @@ define void @smin_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: smin z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smin v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smin v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2) @@ -243,6 +344,11 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v4i16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: smin v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res } @@ -256,6 +362,11 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res } @@ -271,6 +382,15 @@ define void @smin_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: smin z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smin v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smin v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2) @@ -287,6 +407,11 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res } @@ -300,6 +425,11 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.smin.v4i32(<4 x 
i32> %op1, <4 x i32> %op2) ret <4 x i32> %res } @@ -315,6 +445,15 @@ define void @smin_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: smin z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2) @@ -332,6 +471,12 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res } @@ -346,6 +491,12 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res } @@ -361,6 +512,18 @@ define void @smin_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: smin z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmgt v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmgt v5.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: bit v0.16b, 
v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2) @@ -381,6 +544,11 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res } @@ -394,6 +562,11 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res } @@ -409,6 +582,15 @@ define void @umax_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: umax z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umax v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umax v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2) @@ -425,6 +607,11 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v4i16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: umax v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res } @@ -438,6 +625,11 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res } @@ -453,6 +645,15 @@ define void @umax_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: umax z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umax v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umax v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2) @@ -469,6 +670,11 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res } @@ -482,6 +688,11 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.umax.v4i32(<4 x 
i32> %op1, <4 x i32> %op2) ret <4 x i32> %res } @@ -497,6 +708,15 @@ define void @umax_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2) @@ -514,6 +734,12 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res } @@ -528,6 +754,12 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmhi v2.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res } @@ -543,6 +775,18 @@ define void @umax_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: umax z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmhi v4.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmhi v5.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: bit v0.16b, 
v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2) @@ -563,6 +807,11 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res } @@ -576,6 +825,11 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res } @@ -591,6 +845,15 @@ define void @umin_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: umin z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umin v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umin v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2) @@ -607,6 +870,11 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v4i16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: umin v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res } @@ -620,6 +888,11 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res } @@ -635,6 +908,15 @@ define void @umin_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: umin z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umin v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umin v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2) @@ -651,6 +933,11 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res } @@ -664,6 +951,11 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.umin.v4i32(<4 x 
i32> %op1, <4 x i32> %op2) ret <4 x i32> %res } @@ -679,6 +971,15 @@ define void @umin_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: umin z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2) @@ -696,6 +997,12 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res } @@ -710,6 +1017,12 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res } @@ -725,6 +1038,18 @@ define void @umin_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: umin z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmhi v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmhi v5.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: bit 
v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll index 3ff6983210a0a3..3a03de3442d581 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -20,6 +21,12 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { ; NO-FA64-NEXT: mad z0.b, p0/m, z1.b, z2.b ; NO-FA64-NEXT: // kill: def $d0 killed $d0 killed $z0 ; NO-FA64-NEXT: ret +; +; NONEON-NOSVE-LABEL: mla8xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mla v2.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %tmp1 = mul <8 x i8> %A, %B; %tmp2 = add <8 x i8> %C, %tmp1; ret <8 x i8> %tmp2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll index 8917f43002daf9..1ed3d8fa39d8da 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s 
--check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE ; This test only tests the legal types for a given vector width, as mulh nodes ; do not get generated for non-legal types. @@ -36,6 +37,16 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE2-NEXT: lsr z0.h, z0.h, #4 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #4 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i16> undef, i16 4, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer %1 = sext <4 x i8> %op1 to <4 x i16> @@ -63,6 +74,12 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; SVE2-NEXT: smulh z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: shrn v0.8b, v0.8h, #8 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i16> undef, i16 8, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer %1 = sext <8 x i8> %op1 to <8 x i16> @@ -90,6 +107,13 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; SVE2-NEXT: smulh z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull2 v2.8h, v0.16b, v1.16b +; 
NONEON-NOSVE-NEXT: smull v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %1 = sext <16 x i8> %op1 to <16 x i16> %2 = sext <16 x i8> %op2 to <16 x i16> %mul = mul <16 x i16> %1, %2 @@ -118,6 +142,19 @@ define void @smulh_v32i8(ptr %a, ptr %b) { ; SVE2-NEXT: smulh z1.b, z2.b, z3.b ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smull2 v4.8h, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smull v0.8h, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: smull2 v1.8h, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: smull v2.8h, v2.8b, v3.8b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v4.16b +; NONEON-NOSVE-NEXT: uzp2 v1.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %1 = sext <32 x i8> %op1 to <32 x i16> @@ -153,6 +190,16 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE2-NEXT: lsr z0.s, z0.s, #16 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i16> %op1 to <2 x i32> %2 = sext <2 x i16> %op2 to <2 x i32> %mul = mul <2 x i32> %1, %2 @@ -178,6 +225,12 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; SVE2-NEXT: smulh z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull v0.4s, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: shrn v0.4h, v0.4s, #16 +; 
NONEON-NOSVE-NEXT: ret %1 = sext <4 x i16> %op1 to <4 x i32> %2 = sext <4 x i16> %op2 to <4 x i32> %mul = mul <4 x i32> %1, %2 @@ -203,6 +256,13 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; SVE2-NEXT: smulh z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull2 v2.4s, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: smull v0.4s, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: ret %1 = sext <8 x i16> %op1 to <8 x i32> %2 = sext <8 x i16> %op2 to <8 x i32> %mul = mul <8 x i32> %1, %2 @@ -231,6 +291,19 @@ define void @smulh_v16i16(ptr %a, ptr %b) { ; SVE2-NEXT: smulh z1.h, z2.h, z3.h ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smull2 v4.4s, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smull v0.4s, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: smull2 v1.4s, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: smull v2.4s, v2.4h, v3.4h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %1 = sext <16 x i16> %op1 to <16 x i32> @@ -259,6 +332,12 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; SVE2-NEXT: smulh z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull v0.2d, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: shrn v0.2s, v0.2d, #32 +; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i32> %op1 to <2 x i64> %2 = sext <2 x i32> %op2 to <2 x i64> %mul = mul <2 x i64> %1, %2 @@ -284,6 +363,13 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; SVE2-NEXT: smulh z0.s, z0.s, 
z1.s ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull2 v2.2d, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: smull v0.2d, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: ret %1 = sext <4 x i32> %op1 to <4 x i64> %2 = sext <4 x i32> %op2 to <4 x i64> %mul = mul <4 x i64> %1, %2 @@ -312,6 +398,19 @@ define void @smulh_v8i32(ptr %a, ptr %b) { ; SVE2-NEXT: smulh z1.s, z2.s, z3.s ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smull2 v4.2d, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smull v0.2d, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: smull2 v1.2d, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: smull v2.2d, v2.2s, v3.2s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp2 v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %1 = sext <8 x i32> %op1 to <8 x i64> @@ -340,6 +439,16 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; SVE2-NEXT: smulh z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: fmov x9, d1 +; NONEON-NOSVE-NEXT: smulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x i128> undef, i128 64, i128 0 %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer %1 = sext <1 x i64> %op1 to <1 x i128> @@ -367,6 +476,19 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; SVE2-NEXT: smulh z0.d, z0.d, z1.d 
; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: fmov x11, d1 +; NONEON-NOSVE-NEXT: smulh x10, x10, x11 +; NONEON-NOSVE-NEXT: smulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i64> %op1 to <2 x i128> %2 = sext <2 x i64> %op2 to <2 x i128> %mul = mul <2 x i128> %1, %2 @@ -395,6 +517,31 @@ define void @smulh_v4i64(ptr %a, ptr %b) { ; SVE2-NEXT: smulh z1.d, z2.d, z3.d ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x11, v0.d[1] +; NONEON-NOSVE-NEXT: mov x14, v3.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x10, v1.d[1] +; NONEON-NOSVE-NEXT: mov x13, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x12, d3 +; NONEON-NOSVE-NEXT: smulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov x9, d2 +; NONEON-NOSVE-NEXT: smulh x10, x10, x11 +; NONEON-NOSVE-NEXT: smulh x9, x9, x12 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: smulh x11, x13, x14 +; NONEON-NOSVE-NEXT: fmov d1, x10 +; NONEON-NOSVE-NEXT: fmov d2, x9 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: fmov d3, x11 +; NONEON-NOSVE-NEXT: mov v2.d[1], v3.d[0] +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %1 = sext <4 x i64> %op1 to <4 x i128> @@ -433,6 +580,15 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE2-NEXT: lsr z0.h, z0.h, #4 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v4i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #4 +; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i8> %op1 to <4 x i16> %2 = zext <4 x i8> %op2 to <4 x i16> %mul = mul <4 x i16> %1, %2 @@ -458,6 +614,12 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; SVE2-NEXT: umulh z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: shrn v0.8b, v0.8h, #8 +; NONEON-NOSVE-NEXT: ret %1 = zext <8 x i8> %op1 to <8 x i16> %2 = zext <8 x i8> %op2 to <8 x i16> %mul = mul <8 x i16> %1, %2 @@ -483,6 +645,13 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; SVE2-NEXT: umulh z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull2 v2.8h, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: umull v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %1 = zext <16 x i8> %op1 to <16 x i16> %2 = zext <16 x i8> %op2 to <16 x i16> %mul = mul <16 x i16> %1, %2 @@ -511,6 +680,19 @@ define void @umulh_v32i8(ptr %a, ptr %b) { ; SVE2-NEXT: umulh z1.b, z2.b, z3.b ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umull2 v4.8h, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umull v0.8h, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: umull2 v1.8h, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: umull v2.8h, v2.8b, v3.8b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v4.16b +; NONEON-NOSVE-NEXT: uzp2 v1.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %1 = zext <32 x i8> %op1 to <32 x i16> @@ -545,6 +727,15 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE2-NEXT: lsr z0.s, z0.s, #16 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i16> %op1 to <2 x i32> %2 = zext <2 x i16> %op2 to <2 x i32> %mul = mul <2 x i32> %1, %2 @@ -570,6 +761,12 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; SVE2-NEXT: umulh z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull v0.4s, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: shrn v0.4h, v0.4s, #16 +; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i16> %op1 to <4 x i32> %2 = zext <4 x i16> %op2 to <4 x i32> %mul = mul <4 x i32> %1, %2 @@ -595,6 +792,13 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; SVE2-NEXT: umulh z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull2 v2.4s, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: umull v0.4s, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: ret %1 = zext <8 x i16> %op1 to <8 x i32> %2 = zext <8 x i16> %op2 to <8 x i32> %mul = mul <8 x i32> %1, %2 @@ -623,6 +827,19 @@ define void @umulh_v16i16(ptr %a, ptr %b) { ; SVE2-NEXT: umulh z1.h, z2.h, z3.h ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] 
+; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umull2 v4.4s, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umull v0.4s, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: umull2 v1.4s, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: umull v2.4s, v2.4h, v3.4h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %1 = zext <16 x i16> %op1 to <16 x i32> @@ -651,6 +868,12 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; SVE2-NEXT: umulh z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull v0.2d, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: shrn v0.2s, v0.2d, #32 +; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i32> %op1 to <2 x i64> %2 = zext <2 x i32> %op2 to <2 x i64> %mul = mul <2 x i64> %1, %2 @@ -676,6 +899,13 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; SVE2-NEXT: umulh z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull2 v2.2d, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: umull v0.2d, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i32> %op1 to <4 x i64> %2 = zext <4 x i32> %op2 to <4 x i64> %mul = mul <4 x i64> %1, %2 @@ -704,6 +934,19 @@ define void @umulh_v8i32(ptr %a, ptr %b) { ; SVE2-NEXT: umulh z1.s, z2.s, z3.s ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umull2 v4.2d, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umull v0.2d, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: umull2 v1.2d, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: umull v2.2d, v2.2s, v3.2s +; 
NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp2 v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %insert = insertelement <8 x i64> undef, i64 32, i64 0 @@ -734,6 +977,16 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; SVE2-NEXT: umulh z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: fmov x9, d1 +; NONEON-NOSVE-NEXT: umulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %1 = zext <1 x i64> %op1 to <1 x i128> %2 = zext <1 x i64> %op2 to <1 x i128> %mul = mul <1 x i128> %1, %2 @@ -759,6 +1012,19 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; SVE2-NEXT: umulh z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: fmov x11, d1 +; NONEON-NOSVE-NEXT: umulh x10, x10, x11 +; NONEON-NOSVE-NEXT: umulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i64> %op1 to <2 x i128> %2 = zext <2 x i64> %op2 to <2 x i128> %mul = mul <2 x i128> %1, %2 @@ -787,6 +1053,31 @@ define void @umulh_v4i64(ptr %a, ptr %b) { ; SVE2-NEXT: umulh z1.d, z2.d, z3.d ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov 
x11, v0.d[1] +; NONEON-NOSVE-NEXT: mov x14, v3.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x10, v1.d[1] +; NONEON-NOSVE-NEXT: mov x13, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x12, d3 +; NONEON-NOSVE-NEXT: umulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov x9, d2 +; NONEON-NOSVE-NEXT: umulh x10, x10, x11 +; NONEON-NOSVE-NEXT: umulh x9, x9, x12 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: umulh x11, x13, x14 +; NONEON-NOSVE-NEXT: fmov d1, x10 +; NONEON-NOSVE-NEXT: fmov d2, x9 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: fmov d3, x11 +; NONEON-NOSVE-NEXT: mov v2.d[1], v3.d[0] +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %1 = zext <4 x i64> %op1 to <4 x i128> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll index 1123907f338993..ad75ba62e17cf8 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,12 @@ define i8 @uaddv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) ret i8 %res } @@ -30,6 +37,12 @@ define i8 @uaddv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // 
kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) ret i8 %res } @@ -44,6 +57,14 @@ define i8 @uaddv_v32i8(ptr %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: addv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op) ret i8 %res @@ -58,6 +79,12 @@ define i16 @uaddv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) ret i16 %res } @@ -71,6 +98,12 @@ define i16 @uaddv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a) ret i16 %res } @@ -85,6 +118,14 @@ define i16 @uaddv_v16i16(ptr %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x 
i16>, ptr %a %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op) ret i16 %res @@ -99,6 +140,12 @@ define i32 @uaddv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) ret i32 %res } @@ -112,6 +159,12 @@ define i32 @uaddv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) ret i32 %res } @@ -126,6 +179,14 @@ define i32 @uaddv_v8i32(ptr %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: addv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op) ret i32 %res @@ -139,6 +200,12 @@ define i64 @uaddv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addp d0, v0.2d +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) ret i64 %res } @@ -152,6 +219,14 @@ define i64 @uaddv_v4i64(ptr %a) { ; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, 
q0, [x0] +; NONEON-NOSVE-NEXT: add v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: addp d0, v0.2d +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op) ret i64 %res @@ -169,6 +244,12 @@ define i8 @smaxv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a) ret i8 %res } @@ -181,6 +262,12 @@ define i8 @smaxv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a) ret i8 %res } @@ -194,6 +281,14 @@ define i8 @smaxv_v32i8(ptr %a) { ; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smax v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op) ret i8 %res @@ -207,6 +302,12 @@ define i16 @smaxv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a) ret i16 %res } @@ -219,6 +320,12 @@ define i16 @smaxv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: 
ret +; +; NONEON-NOSVE-LABEL: smaxv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a) ret i16 %res } @@ -232,6 +339,14 @@ define i16 @smaxv_v16i16(ptr %a) { ; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smax v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smaxv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op) ret i16 %res @@ -245,6 +360,12 @@ define i32 @smaxv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a) ret i32 %res } @@ -257,6 +378,12 @@ define i32 @smaxv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) ret i32 %res } @@ -270,6 +397,14 @@ define i32 @smaxv_v8i32(ptr %a) { ; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smaxv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op) ret i32 %res @@ -284,6 
+419,17 @@ define i64 @smaxv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: smaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a) ret i64 %res } @@ -297,6 +443,20 @@ define i64 @smaxv_v4i64(ptr %a) { ; CHECK-NEXT: smaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op) ret i64 %res @@ -314,6 +474,12 @@ define i8 @sminv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a) ret i8 %res } @@ -326,6 +492,12 @@ define i8 @sminv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res 
= call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a) ret i8 %res } @@ -339,6 +511,14 @@ define i8 @sminv_v32i8(ptr %a) { ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smin v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: sminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op) ret i8 %res @@ -352,6 +532,12 @@ define i16 @sminv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a) ret i16 %res } @@ -364,6 +550,12 @@ define i16 @sminv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a) ret i16 %res } @@ -377,6 +569,14 @@ define i16 @sminv_v16i16(ptr %a) { ; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smin v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: sminv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op) ret i16 %res @@ -390,6 +590,12 @@ define i32 @sminv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v2i32: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: sminp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a) ret i32 %res } @@ -402,6 +608,12 @@ define i32 @sminv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) ret i32 %res } @@ -415,6 +627,14 @@ define i32 @sminv_v8i32(ptr %a) { ; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: sminv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op) ret i32 %res @@ -429,6 +649,17 @@ define i64 @sminv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: sminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a) ret i64 %res } @@ -442,6 +673,20 @@ define i64 @sminv_v4i64(ptr %a) { ; CHECK-NEXT: sminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op) ret i64 %res @@ -459,6 +704,12 @@ define i8 @umaxv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a) ret i8 %res } @@ -471,6 +722,12 @@ define i8 @umaxv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a) ret i8 %res } @@ -484,6 +741,14 @@ define i8 @umaxv_v32i8(ptr %a) { ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
umaxv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umax v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op) ret i8 %res @@ -497,6 +762,12 @@ define i16 @umaxv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a) ret i16 %res } @@ -509,6 +780,12 @@ define i16 @umaxv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a) ret i16 %res } @@ -522,6 +799,14 @@ define i16 @umaxv_v16i16(ptr %a) { ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umax v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umaxv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op) ret i16 %res @@ -535,6 +820,12 @@ define i32 @umaxv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a) ret i32 %res } @@ -547,6 +838,12 @@ define i32 
@umaxv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) ret i32 %res } @@ -560,6 +857,14 @@ define i32 @umaxv_v8i32(ptr %a) { ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umaxv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op) ret i32 %res @@ -574,6 +879,17 @@ define i64 @umaxv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: umaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) ret i64 %res } @@ -587,6 +903,20 @@ define i64 @umaxv_v4i64(ptr %a) { ; CHECK-NEXT: umaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op) ret i64 %res @@ -604,6 +934,12 @@ define i8 @uminv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a) ret i8 %res } @@ -616,6 +952,12 @@ define i8 @uminv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a) ret i8 %res } @@ -629,6 +971,14 @@ define i8 @uminv_v32i8(ptr %a) { ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umin v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op) ret i8 %res @@ -642,6 +992,12 @@ define i16 @uminv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a) ret i16 %res } @@ -654,6 +1010,12 
@@ define i16 @uminv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a) ret i16 %res } @@ -667,6 +1029,14 @@ define i16 @uminv_v16i16(ptr %a) { ; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umin v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uminv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op) ret i16 %res @@ -680,6 +1050,12 @@ define i32 @uminv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a) ret i32 %res } @@ -692,6 +1068,12 @@ define i32 @uminv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) ret i32 %res } @@ -705,6 +1087,14 @@ define i32 @uminv_v8i32(ptr %a) { ; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uminv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: 
ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op) ret i32 %res @@ -719,6 +1109,17 @@ define i64 @uminv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a) ret i64 %res } @@ -732,6 +1133,20 @@ define i64 @uminv_v4i64(ptr %a) { ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op) ret i64 %res diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 4ae7586fca1692..99f8aef9f2b22d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -24,6 +25,35 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: smov w11, v1.h[0] +; NONEON-NOSVE-NEXT: smov w12, v0.h[0] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w14, v1.h[2] +; NONEON-NOSVE-NEXT: smov w15, v0.h[2] +; NONEON-NOSVE-NEXT: smov w17, v1.h[3] +; NONEON-NOSVE-NEXT: smov w18, v0.h[3] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; 
NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.h[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = srem <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -53,6 +83,53 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: smov w11, v1.b[0] +; NONEON-NOSVE-NEXT: smov w12, v0.b[0] +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w14, v1.b[2] +; NONEON-NOSVE-NEXT: smov w15, v0.b[2] +; NONEON-NOSVE-NEXT: smov w17, v1.b[3] +; NONEON-NOSVE-NEXT: smov w18, v0.b[3] +; NONEON-NOSVE-NEXT: smov w1, v1.b[4] +; NONEON-NOSVE-NEXT: smov w2, v0.b[4] +; NONEON-NOSVE-NEXT: smov w4, v1.b[5] +; NONEON-NOSVE-NEXT: smov w5, v0.b[5] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.b[7] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[6] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: smov w14, v0.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[2], w8 +; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: mov v2.b[3], w8 +; 
NONEON-NOSVE-NEXT: sdiv w9, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: mov v2.b[4], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.b[6], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = srem <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -102,6 +179,112 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #-80]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: smov w11, v1.b[0] +; NONEON-NOSVE-NEXT: smov w12, v0.b[0] +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w14, v1.b[2] +; NONEON-NOSVE-NEXT: smov w15, v0.b[2] +; NONEON-NOSVE-NEXT: smov w17, v1.b[3] +; NONEON-NOSVE-NEXT: smov w18, v0.b[3] +; NONEON-NOSVE-NEXT: smov w1, v1.b[4] 
+; NONEON-NOSVE-NEXT: smov w2, v0.b[4] +; NONEON-NOSVE-NEXT: smov w4, v1.b[5] +; NONEON-NOSVE-NEXT: smov w5, v0.b[5] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: smov w7, v1.b[6] +; NONEON-NOSVE-NEXT: smov w19, v0.b[6] +; NONEON-NOSVE-NEXT: smov w21, v1.b[7] +; NONEON-NOSVE-NEXT: smov w22, v0.b[7] +; NONEON-NOSVE-NEXT: smov w24, v1.b[8] +; NONEON-NOSVE-NEXT: smov w25, v0.b[8] +; NONEON-NOSVE-NEXT: smov w27, v1.b[9] +; NONEON-NOSVE-NEXT: smov w28, v0.b[9] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.b[11] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[10] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[10] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: smov w14, v0.b[11] +; NONEON-NOSVE-NEXT: smov w16, v1.b[12] +; NONEON-NOSVE-NEXT: mov v2.b[2], w8 +; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: smov w17, v0.b[12] +; NONEON-NOSVE-NEXT: smov w0, v1.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[3], w8 +; NONEON-NOSVE-NEXT: sdiv w6, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: smov w1, v0.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[4], w8 +; NONEON-NOSVE-NEXT: sdiv w20, w19, w7 +; NONEON-NOSVE-NEXT: msub w8, w6, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: sdiv w23, w22, w21 +; NONEON-NOSVE-NEXT: msub w8, w20, w7, w19 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[6], w8 +; NONEON-NOSVE-NEXT: sdiv w26, w25, w24 +; NONEON-NOSVE-NEXT: msub w8, w23, w21, w22 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w28, w27 +; NONEON-NOSVE-NEXT: msub w8, w26, w24, w25 
+; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[8], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w27, w28 +; NONEON-NOSVE-NEXT: mov v2.b[9], w8 +; NONEON-NOSVE-NEXT: sdiv w15, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: smov w10, v1.b[14] +; NONEON-NOSVE-NEXT: smov w11, v0.b[14] +; NONEON-NOSVE-NEXT: mov v2.b[10], w8 +; NONEON-NOSVE-NEXT: sdiv w18, w17, w16 +; NONEON-NOSVE-NEXT: msub w8, w15, w13, w14 +; NONEON-NOSVE-NEXT: smov w13, v1.b[15] +; NONEON-NOSVE-NEXT: smov w14, v0.b[15] +; NONEON-NOSVE-NEXT: mov v2.b[11], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w1, w0 +; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 +; NONEON-NOSVE-NEXT: mov v2.b[12], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w0, w1 +; NONEON-NOSVE-NEXT: mov v2.b[13], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.b[14], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.b[15], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp], #80 // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %res = srem <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -189,6 +372,279 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mls z2.b, p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #320 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #272] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp 
x20, x19, [sp, #304] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 320 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: str x0, [sp, #216] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w4, v3.b[1] +; NONEON-NOSVE-NEXT: smov w1, v2.b[1] +; NONEON-NOSVE-NEXT: smov w7, v3.b[7] +; NONEON-NOSVE-NEXT: smov w5, v2.b[7] +; NONEON-NOSVE-NEXT: smov w6, v3.b[8] +; NONEON-NOSVE-NEXT: smov w3, v2.b[8] +; NONEON-NOSVE-NEXT: smov w22, v3.b[9] +; NONEON-NOSVE-NEXT: smov w20, v2.b[9] +; NONEON-NOSVE-NEXT: smov w13, v3.b[0] +; NONEON-NOSVE-NEXT: smov w17, v3.b[3] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #100] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w8, v1.b[0] +; NONEON-NOSVE-NEXT: str w9, [sp, #108] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w9, v0.b[0] +; NONEON-NOSVE-NEXT: smov w14, v2.b[3] +; NONEON-NOSVE-NEXT: smov w15, v3.b[4] +; NONEON-NOSVE-NEXT: smov w12, v2.b[4] +; NONEON-NOSVE-NEXT: smov w2, v3.b[5] +; NONEON-NOSVE-NEXT: smov w18, v2.b[5] +; NONEON-NOSVE-NEXT: smov w0, v3.b[6] +; NONEON-NOSVE-NEXT: smov w16, v2.b[6] +; NONEON-NOSVE-NEXT: smov w21, v3.b[10] +; NONEON-NOSVE-NEXT: smov w19, v2.b[10] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w30, [sp, 
#36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: str w10, [sp, #116] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[2] +; NONEON-NOSVE-NEXT: smov w9, v0.b[2] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #44] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[3] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #52] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w9, v0.b[3] +; NONEON-NOSVE-NEXT: sdiv w26, w14, w17 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w11, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[4] +; NONEON-NOSVE-NEXT: smov w9, v0.b[4] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #60] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[5] +; NONEON-NOSVE-NEXT: smov w9, v0.b[5] +; NONEON-NOSVE-NEXT: str w8, [sp, #96] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #104] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #68] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[6] +; NONEON-NOSVE-NEXT: smov w9, v0.b[6] +; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #112] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[7] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w9, v0.b[7] +; NONEON-NOSVE-NEXT: sdiv w25, w12, w15 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[8] +; NONEON-NOSVE-NEXT: smov w9, v0.b[8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #140] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[9] +; NONEON-NOSVE-NEXT: smov w9, v0.b[9] +; NONEON-NOSVE-NEXT: 
str w8, [sp, #148] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #156] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w11, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[10] +; NONEON-NOSVE-NEXT: smov w9, v0.b[10] +; NONEON-NOSVE-NEXT: str w10, [sp, #128] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[11] +; NONEON-NOSVE-NEXT: smov w9, v0.b[11] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #212] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[12] +; NONEON-NOSVE-NEXT: smov w9, v0.b[12] +; NONEON-NOSVE-NEXT: str w8, [sp, #172] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #180] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #200] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[13] +; NONEON-NOSVE-NEXT: smov w9, v0.b[13] +; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #164] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w11, v3.b[2] +; NONEON-NOSVE-NEXT: str w9, [sp, #176] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #188] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[14] +; NONEON-NOSVE-NEXT: smov w9, v0.b[14] +; NONEON-NOSVE-NEXT: str w8, [sp, #144] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #152] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #184] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v2.b[2] +; NONEON-NOSVE-NEXT: sdiv w8, w1, w4 +; NONEON-NOSVE-NEXT: str w10, [sp, #160] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w10, v2.b[0] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w8, w5, w7 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: 
sdiv w8, w3, w6 +; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w8, w20, w22 +; NONEON-NOSVE-NEXT: sdiv w24, w10, w13 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w29, w8, [sp, #40] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w30, w29 +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s4, w8 +; NONEON-NOSVE-NEXT: sdiv w23, w9, w11 +; NONEON-NOSVE-NEXT: msub w10, w24, w13, w10 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w24, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w13, w13, w4, w1 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w4, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s5, w10 +; NONEON-NOSVE-NEXT: msub w1, w1, w24, w4 +; NONEON-NOSVE-NEXT: mov v5.b[1], w13 +; NONEON-NOSVE-NEXT: mov v4.b[1], w1 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w23, w11, w9 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w28, w18, w2 +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #52] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #272] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w26, w17, w14 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w11, w10 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: smov w10, v3.b[11] +; NONEON-NOSVE-NEXT: smov w11, v2.b[11] +; NONEON-NOSVE-NEXT: mov v4.b[2], w9 +; NONEON-NOSVE-NEXT: mov v5.b[3], w8 +; NONEON-NOSVE-NEXT: msub w8, w25, w15, w12 +; NONEON-NOSVE-NEXT: ldp w13, w9, [sp, #76] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w27, w16, w0 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: 
ldp x26, x25, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w14, w13 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[4], w8 +; NONEON-NOSVE-NEXT: msub w8, w28, w2, w18 +; NONEON-NOSVE-NEXT: ldr w2, [sp, #156] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[3], w9 +; NONEON-NOSVE-NEXT: ldp w12, w9, [sp, #64] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[5], w8 +; NONEON-NOSVE-NEXT: msub w8, w27, w0, w16 +; NONEON-NOSVE-NEXT: ldr w0, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w4, w19, w21 +; NONEON-NOSVE-NEXT: msub w9, w9, w14, w12 +; NONEON-NOSVE-NEXT: smov w12, v3.b[12] +; NONEON-NOSVE-NEXT: smov w14, v2.b[12] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[6], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[4], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w7, w5 +; NONEON-NOSVE-NEXT: ldr w5, [sp, #204] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w17, w15 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[7], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w13, w11, w10 +; NONEON-NOSVE-NEXT: mov v4.b[5], w9 +; NONEON-NOSVE-NEXT: ldp w16, w9, [sp, #88] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w6, w3 +; NONEON-NOSVE-NEXT: ldr w3, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w17, w16 +; NONEON-NOSVE-NEXT: smov w16, v3.b[13] +; NONEON-NOSVE-NEXT: smov w17, v2.b[13] +; NONEON-NOSVE-NEXT: mov v5.b[8], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[6], w9 +; NONEON-NOSVE-NEXT: msub w8, w8, w22, w20 +; NONEON-NOSVE-NEXT: sdiv w15, w14, w12 +; NONEON-NOSVE-NEXT: ldp w18, w9, [sp, #136] // 8-byte Folded Reload +; 
NONEON-NOSVE-NEXT: mov v5.b[9], w8 +; NONEON-NOSVE-NEXT: msub w8, w4, w21, w19 +; NONEON-NOSVE-NEXT: msub w9, w9, w0, w18 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #304] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[7], w9 +; NONEON-NOSVE-NEXT: mov v5.b[10], w8 +; NONEON-NOSVE-NEXT: msub w8, w13, w10, w11 +; NONEON-NOSVE-NEXT: ldp w0, w9, [sp, #124] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #196] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w13, [sp, #192] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w18, w17, w16 +; NONEON-NOSVE-NEXT: msub w9, w9, w1, w0 +; NONEON-NOSVE-NEXT: mov v5.b[11], w8 +; NONEON-NOSVE-NEXT: smov w0, v3.b[14] +; NONEON-NOSVE-NEXT: msub w10, w10, w13, w11 +; NONEON-NOSVE-NEXT: smov w1, v2.b[14] +; NONEON-NOSVE-NEXT: msub w8, w15, w12, w14 +; NONEON-NOSVE-NEXT: mov v4.b[8], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w15, w13, [sp, #168] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w3, w2 +; NONEON-NOSVE-NEXT: mov v5.b[12], w8 +; NONEON-NOSVE-NEXT: ldp w4, w3, [sp, #208] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w14, w12, [sp, #176] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[9], w9 +; NONEON-NOSVE-NEXT: sdiv w2, w1, w0 +; NONEON-NOSVE-NEXT: smov w9, v3.b[15] +; NONEON-NOSVE-NEXT: msub w3, w3, w5, w4 +; NONEON-NOSVE-NEXT: smov w4, v2.b[15] +; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[10], w3 +; NONEON-NOSVE-NEXT: mov v5.b[13], w8 +; NONEON-NOSVE-NEXT: mov v4.b[11], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #188] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w11, w4, w9 +; NONEON-NOSVE-NEXT: msub w8, w2, w0, w1 +; NONEON-NOSVE-NEXT: msub w10, w10, w13, w12 +; NONEON-NOSVE-NEXT: smov w12, v1.b[15] +; NONEON-NOSVE-NEXT: smov w13, v0.b[15] +; 
NONEON-NOSVE-NEXT: mov v5.b[14], w8 +; NONEON-NOSVE-NEXT: mov v4.b[12], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #184] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w10, w10, w15, w14 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #152] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w14, w13, w12 +; NONEON-NOSVE-NEXT: msub w8, w11, w9, w4 +; NONEON-NOSVE-NEXT: mov v4.b[13], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #160] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[15], w8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #216] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w10, w10, w16, w15 +; NONEON-NOSVE-NEXT: mov v4.b[14], w10 +; NONEON-NOSVE-NEXT: msub w9, w14, w12, w13 +; NONEON-NOSVE-NEXT: mov v4.b[15], w9 +; NONEON-NOSVE-NEXT: stp q5, q4, [x8] +; NONEON-NOSVE-NEXT: add sp, sp, #320 +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = srem <32 x i8> %op1, %op2 @@ -210,6 +666,33 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: smov w11, v1.h[0] +; NONEON-NOSVE-NEXT: smov w12, v0.h[0] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w14, v1.h[2] +; NONEON-NOSVE-NEXT: smov w15, v0.h[2] +; NONEON-NOSVE-NEXT: smov w17, v1.h[3] +; NONEON-NOSVE-NEXT: smov w18, v0.h[3] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: 
mov v0.h[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = srem <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -238,6 +721,51 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smov w11, v1.h[0] +; NONEON-NOSVE-NEXT: smov w12, v0.h[0] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w14, v1.h[2] +; NONEON-NOSVE-NEXT: smov w15, v0.h[2] +; NONEON-NOSVE-NEXT: smov w17, v1.h[3] +; NONEON-NOSVE-NEXT: smov w18, v0.h[3] +; NONEON-NOSVE-NEXT: smov w1, v1.h[4] +; NONEON-NOSVE-NEXT: smov w2, v0.h[4] +; NONEON-NOSVE-NEXT: smov w4, v1.h[5] +; NONEON-NOSVE-NEXT: smov w5, v0.h[5] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.h[7] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: smov w11, v0.h[6] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: smov w14, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; 
NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = srem <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -282,6 +810,139 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w20, v1.h[0] +; NONEON-NOSVE-NEXT: smov w21, v0.h[0] +; NONEON-NOSVE-NEXT: smov w19, v0.h[3] +; NONEON-NOSVE-NEXT: smov w5, v1.h[4] +; NONEON-NOSVE-NEXT: smov w2, v0.h[4] +; NONEON-NOSVE-NEXT: smov w1, v3.h[1] +; NONEON-NOSVE-NEXT: smov w23, v2.h[1] +; 
NONEON-NOSVE-NEXT: smov w25, v3.h[0] +; NONEON-NOSVE-NEXT: smov w26, v2.h[0] +; NONEON-NOSVE-NEXT: smov w6, v1.h[5] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w8, v1.h[2] +; NONEON-NOSVE-NEXT: smov w9, v0.h[2] +; NONEON-NOSVE-NEXT: smov w3, v0.h[5] +; NONEON-NOSVE-NEXT: smov w4, v1.h[6] +; NONEON-NOSVE-NEXT: smov w7, v0.h[6] +; NONEON-NOSVE-NEXT: smov w28, v3.h[2] +; NONEON-NOSVE-NEXT: smov w29, v2.h[2] +; NONEON-NOSVE-NEXT: smov w15, v3.h[3] +; NONEON-NOSVE-NEXT: smov w13, v2.h[3] +; NONEON-NOSVE-NEXT: smov w12, v3.h[4] +; NONEON-NOSVE-NEXT: smov w14, v3.h[5] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w11, w21, w20 +; NONEON-NOSVE-NEXT: str w10, [sp, #44] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.h[3] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w11, v2.h[4] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #4] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w20, w22, w20, w21 +; NONEON-NOSVE-NEXT: sdiv w9, w19, w8 +; NONEON-NOSVE-NEXT: str w10, [sp, #32] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w10, v3.h[6] +; NONEON-NOSVE-NEXT: fmov s5, w20 +; NONEON-NOSVE-NEXT: smov w20, v3.h[7] +; NONEON-NOSVE-NEXT: sdiv w8, w2, w5 +; NONEON-NOSVE-NEXT: sdiv w24, w23, w1 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w27, w26, w25 +; NONEON-NOSVE-NEXT: msub w1, w24, w1, w23 +; NONEON-NOSVE-NEXT: ldp w24, w23, [sp, #40] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w9, w3, w6 +; NONEON-NOSVE-NEXT: msub w21, w27, w25, w26 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w23, w23, w25, w24 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s4, w21 +; NONEON-NOSVE-NEXT: mov v5.h[1], w23 +; NONEON-NOSVE-NEXT: ldp 
w23, w21, [sp, #28] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[1], w1 +; NONEON-NOSVE-NEXT: sdiv w8, w7, w4 +; NONEON-NOSVE-NEXT: msub w21, w21, w25, w23 +; NONEON-NOSVE-NEXT: smov w23, v2.h[7] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.h[2], w21 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w30, w29, w28 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w9, v2.h[5] +; NONEON-NOSVE-NEXT: smov w8, v2.h[6] +; NONEON-NOSVE-NEXT: sdiv w18, w13, w15 +; NONEON-NOSVE-NEXT: msub w1, w30, w28, w29 +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[2], w1 +; NONEON-NOSVE-NEXT: sdiv w16, w11, w12 +; NONEON-NOSVE-NEXT: msub w13, w18, w15, w13 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w18, [sp] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w15, w15, w18, w19 +; NONEON-NOSVE-NEXT: mov v4.h[3], w13 +; NONEON-NOSVE-NEXT: smov w13, v1.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[3], w15 +; NONEON-NOSVE-NEXT: smov w15, v0.h[7] +; NONEON-NOSVE-NEXT: sdiv w17, w9, w14 +; NONEON-NOSVE-NEXT: msub w11, w16, w12, w11 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #16] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w12, w12, w5, w2 +; NONEON-NOSVE-NEXT: mov v4.h[4], w11 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.h[4], w12 +; NONEON-NOSVE-NEXT: msub w11, w11, w6, w3 +; NONEON-NOSVE-NEXT: sdiv w24, w8, w10 +; NONEON-NOSVE-NEXT: msub w9, w17, w14, w9 +; NONEON-NOSVE-NEXT: mov v5.h[5], w11 +; NONEON-NOSVE-NEXT: mov v4.h[5], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w4, w7 +; NONEON-NOSVE-NEXT: sdiv w18, w23, w20 +; NONEON-NOSVE-NEXT: msub w8, w24, w10, w8 +; 
NONEON-NOSVE-NEXT: mov v5.h[6], w9 +; NONEON-NOSVE-NEXT: mov v4.h[6], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w15, w13 +; NONEON-NOSVE-NEXT: msub w8, w18, w20, w23 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[7], w8 +; NONEON-NOSVE-NEXT: msub w9, w12, w13, w15 +; NONEON-NOSVE-NEXT: mov v5.h[7], w9 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #144 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = srem <16 x i16> %op1, %op2 @@ -300,6 +961,23 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w11, v1.s[1] +; NONEON-NOSVE-NEXT: mov w12, v0.s[1] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = srem <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -315,6 +993,30 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov w11, s1 +; NONEON-NOSVE-NEXT: fmov w12, s0 +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: mov w14, v1.s[2] +; NONEON-NOSVE-NEXT: mov 
w15, v0.s[2] +; NONEON-NOSVE-NEXT: mov w17, v1.s[3] +; NONEON-NOSVE-NEXT: mov w18, v0.s[3] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.s[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %res = srem <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -334,6 +1036,65 @@ define void @srem_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str x23, [sp, #-48]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -48 +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov w12, s0 +; NONEON-NOSVE-NEXT: fmov w3, s2 +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w11, s1 +; NONEON-NOSVE-NEXT: fmov w2, s3 +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w17, v3.s[1] +; NONEON-NOSVE-NEXT: mov w18, v2.s[1] +; NONEON-NOSVE-NEXT: mov w14, v1.s[2] +; NONEON-NOSVE-NEXT: mov w15, v0.s[2] +; NONEON-NOSVE-NEXT: mov w5, v3.s[2] +; NONEON-NOSVE-NEXT: mov w6, v2.s[2] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: mov w19, v3.s[3] +; NONEON-NOSVE-NEXT: mov w20, v2.s[3] +; 
NONEON-NOSVE-NEXT: mov w22, v1.s[3] +; NONEON-NOSVE-NEXT: mov w23, v0.s[3] +; NONEON-NOSVE-NEXT: sdiv w4, w3, w2 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s1, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w12, w4, w2, w3 +; NONEON-NOSVE-NEXT: fmov s0, w12 +; NONEON-NOSVE-NEXT: sdiv w1, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v1.s[1], w8 +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w13, w1, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.s[1], w13 +; NONEON-NOSVE-NEXT: sdiv w7, w6, w5 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v1.s[2], w8 +; NONEON-NOSVE-NEXT: sdiv w21, w20, w19 +; NONEON-NOSVE-NEXT: msub w10, w7, w5, w6 +; NONEON-NOSVE-NEXT: mov v0.s[2], w10 +; NONEON-NOSVE-NEXT: sdiv w9, w23, w22 +; NONEON-NOSVE-NEXT: msub w10, w21, w19, w20 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v0.s[3], w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w22, w23 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = srem <8 x i32> %op1, %op2 @@ -352,6 +1113,17 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = srem <1 x i64> %op1, %op2 ret 
<1 x i64> %res } @@ -367,6 +1139,20 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: sdiv x13, x12, x11 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = srem <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -386,6 +1172,33 @@ define void @srem_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: fmov x15, d2 +; NONEON-NOSVE-NEXT: mov x12, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x14, d3 +; NONEON-NOSVE-NEXT: mov x11, v3.d[1] +; NONEON-NOSVE-NEXT: mov x17, v1.d[1] +; NONEON-NOSVE-NEXT: mov x18, v0.d[1] +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: sdiv x16, x15, x14 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: sdiv x13, x12, x11 +; NONEON-NOSVE-NEXT: msub x10, x16, x14, x15 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: sdiv x1, x18, x17 +; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: msub x11, x1, x17, x18 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = srem <4 x i64> %op1, %op2 @@ -413,6 +1226,41 
@@ define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w11, v1.h[0] +; NONEON-NOSVE-NEXT: umov w12, v0.h[0] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w14, v1.h[2] +; NONEON-NOSVE-NEXT: umov w15, v0.h[2] +; NONEON-NOSVE-NEXT: umov w17, v1.h[3] +; NONEON-NOSVE-NEXT: umov w18, v0.h[3] +; NONEON-NOSVE-NEXT: and w11, w11, #0xff +; NONEON-NOSVE-NEXT: and w12, w12, #0xff +; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: and w9, w9, #0xff +; NONEON-NOSVE-NEXT: and w14, w14, #0xff +; NONEON-NOSVE-NEXT: and w15, w15, #0xff +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: and w12, w17, #0xff +; NONEON-NOSVE-NEXT: and w13, w18, #0xff +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w9, w13, w12 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.h[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w12, w13 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = urem <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -442,6 +1290,53 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def 
$q0 +; NONEON-NOSVE-NEXT: umov w11, v1.b[0] +; NONEON-NOSVE-NEXT: umov w12, v0.b[0] +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w14, v1.b[2] +; NONEON-NOSVE-NEXT: umov w15, v0.b[2] +; NONEON-NOSVE-NEXT: umov w17, v1.b[3] +; NONEON-NOSVE-NEXT: umov w18, v0.b[3] +; NONEON-NOSVE-NEXT: umov w1, v1.b[4] +; NONEON-NOSVE-NEXT: umov w2, v0.b[4] +; NONEON-NOSVE-NEXT: umov w4, v1.b[5] +; NONEON-NOSVE-NEXT: umov w5, v0.b[5] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.b[7] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[6] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: umov w14, v0.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[2], w8 +; NONEON-NOSVE-NEXT: udiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: mov v2.b[3], w8 +; NONEON-NOSVE-NEXT: udiv w9, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: mov v2.b[4], w8 +; NONEON-NOSVE-NEXT: udiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: udiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.b[6], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = urem <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -491,6 +1386,112 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v16i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: stp x28, x27, [sp, #-80]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: umov w11, v1.b[0] +; NONEON-NOSVE-NEXT: umov w12, v0.b[0] +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w14, v1.b[2] +; NONEON-NOSVE-NEXT: umov w15, v0.b[2] +; NONEON-NOSVE-NEXT: umov w17, v1.b[3] +; NONEON-NOSVE-NEXT: umov w18, v0.b[3] +; NONEON-NOSVE-NEXT: umov w1, v1.b[4] +; NONEON-NOSVE-NEXT: umov w2, v0.b[4] +; NONEON-NOSVE-NEXT: umov w4, v1.b[5] +; NONEON-NOSVE-NEXT: umov w5, v0.b[5] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: umov w7, v1.b[6] +; NONEON-NOSVE-NEXT: umov w19, v0.b[6] +; NONEON-NOSVE-NEXT: umov w21, v1.b[7] +; NONEON-NOSVE-NEXT: umov w22, v0.b[7] +; NONEON-NOSVE-NEXT: umov w24, v1.b[8] +; NONEON-NOSVE-NEXT: umov w25, v0.b[8] +; NONEON-NOSVE-NEXT: umov w27, v1.b[9] +; NONEON-NOSVE-NEXT: umov w28, v0.b[9] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.b[11] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[10] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[10] +; 
NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: umov w14, v0.b[11] +; NONEON-NOSVE-NEXT: umov w16, v1.b[12] +; NONEON-NOSVE-NEXT: mov v2.b[2], w8 +; NONEON-NOSVE-NEXT: udiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: umov w17, v0.b[12] +; NONEON-NOSVE-NEXT: umov w0, v1.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[3], w8 +; NONEON-NOSVE-NEXT: udiv w6, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: umov w1, v0.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[4], w8 +; NONEON-NOSVE-NEXT: udiv w20, w19, w7 +; NONEON-NOSVE-NEXT: msub w8, w6, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: udiv w23, w22, w21 +; NONEON-NOSVE-NEXT: msub w8, w20, w7, w19 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[6], w8 +; NONEON-NOSVE-NEXT: udiv w26, w25, w24 +; NONEON-NOSVE-NEXT: msub w8, w23, w21, w22 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: udiv w9, w28, w27 +; NONEON-NOSVE-NEXT: msub w8, w26, w24, w25 +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[8], w8 +; NONEON-NOSVE-NEXT: udiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w27, w28 +; NONEON-NOSVE-NEXT: mov v2.b[9], w8 +; NONEON-NOSVE-NEXT: udiv w15, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: umov w10, v1.b[14] +; NONEON-NOSVE-NEXT: umov w11, v0.b[14] +; NONEON-NOSVE-NEXT: mov v2.b[10], w8 +; NONEON-NOSVE-NEXT: udiv w18, w17, w16 +; NONEON-NOSVE-NEXT: msub w8, w15, w13, w14 +; NONEON-NOSVE-NEXT: umov w13, v1.b[15] +; NONEON-NOSVE-NEXT: umov w14, v0.b[15] +; NONEON-NOSVE-NEXT: mov v2.b[11], w8 +; NONEON-NOSVE-NEXT: udiv w9, w1, w0 +; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 +; 
NONEON-NOSVE-NEXT: mov v2.b[12], w8 +; NONEON-NOSVE-NEXT: udiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w0, w1 +; NONEON-NOSVE-NEXT: mov v2.b[13], w8 +; NONEON-NOSVE-NEXT: udiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.b[14], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.b[15], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp], #80 // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %res = urem <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -578,6 +1579,279 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mls z2.b, p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #320 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #272] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #304] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 320 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: str x0, [sp, #216] // 
8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w4, v3.b[1] +; NONEON-NOSVE-NEXT: umov w1, v2.b[1] +; NONEON-NOSVE-NEXT: umov w7, v3.b[7] +; NONEON-NOSVE-NEXT: umov w5, v2.b[7] +; NONEON-NOSVE-NEXT: umov w6, v3.b[8] +; NONEON-NOSVE-NEXT: umov w3, v2.b[8] +; NONEON-NOSVE-NEXT: umov w22, v3.b[9] +; NONEON-NOSVE-NEXT: umov w20, v2.b[9] +; NONEON-NOSVE-NEXT: umov w13, v3.b[0] +; NONEON-NOSVE-NEXT: umov w17, v3.b[3] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #100] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w8, v1.b[0] +; NONEON-NOSVE-NEXT: str w9, [sp, #108] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w9, v0.b[0] +; NONEON-NOSVE-NEXT: umov w14, v2.b[3] +; NONEON-NOSVE-NEXT: umov w15, v3.b[4] +; NONEON-NOSVE-NEXT: umov w12, v2.b[4] +; NONEON-NOSVE-NEXT: umov w2, v3.b[5] +; NONEON-NOSVE-NEXT: umov w18, v2.b[5] +; NONEON-NOSVE-NEXT: umov w0, v3.b[6] +; NONEON-NOSVE-NEXT: umov w16, v2.b[6] +; NONEON-NOSVE-NEXT: umov w21, v3.b[10] +; NONEON-NOSVE-NEXT: umov w19, v2.b[10] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w30, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: str w10, [sp, #116] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[2] +; NONEON-NOSVE-NEXT: umov w9, v0.b[2] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #44] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[3] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #52] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w9, v0.b[3] +; NONEON-NOSVE-NEXT: udiv w26, w14, w17 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w11, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[4] +; NONEON-NOSVE-NEXT: umov w9, v0.b[4] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #60] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; 
NONEON-NOSVE-NEXT: umov w8, v1.b[5] +; NONEON-NOSVE-NEXT: umov w9, v0.b[5] +; NONEON-NOSVE-NEXT: str w8, [sp, #96] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #104] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #68] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[6] +; NONEON-NOSVE-NEXT: umov w9, v0.b[6] +; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #112] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[7] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w9, v0.b[7] +; NONEON-NOSVE-NEXT: udiv w25, w12, w15 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[8] +; NONEON-NOSVE-NEXT: umov w9, v0.b[8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #140] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[9] +; NONEON-NOSVE-NEXT: umov w9, v0.b[9] +; NONEON-NOSVE-NEXT: str w8, [sp, #148] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #156] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w11, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[10] +; NONEON-NOSVE-NEXT: umov w9, v0.b[10] +; NONEON-NOSVE-NEXT: str w10, [sp, #128] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[11] +; NONEON-NOSVE-NEXT: umov w9, v0.b[11] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #212] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[12] +; NONEON-NOSVE-NEXT: umov w9, v0.b[12] +; NONEON-NOSVE-NEXT: str w8, [sp, #172] // 4-byte Folded Spill +; 
NONEON-NOSVE-NEXT: str w9, [sp, #180] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #200] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[13] +; NONEON-NOSVE-NEXT: umov w9, v0.b[13] +; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #164] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w11, v3.b[2] +; NONEON-NOSVE-NEXT: str w9, [sp, #176] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #188] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[14] +; NONEON-NOSVE-NEXT: umov w9, v0.b[14] +; NONEON-NOSVE-NEXT: str w8, [sp, #144] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #152] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #184] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v2.b[2] +; NONEON-NOSVE-NEXT: udiv w8, w1, w4 +; NONEON-NOSVE-NEXT: str w10, [sp, #160] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w10, v2.b[0] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w8, w5, w7 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w8, w3, w6 +; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w8, w20, w22 +; NONEON-NOSVE-NEXT: udiv w24, w10, w13 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w29, w8, [sp, #40] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w30, w29 +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s4, w8 +; NONEON-NOSVE-NEXT: udiv w23, w9, w11 +; NONEON-NOSVE-NEXT: msub w10, w24, w13, w10 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w24, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w13, w13, w4, w1 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w4, 
[sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s5, w10 +; NONEON-NOSVE-NEXT: msub w1, w1, w24, w4 +; NONEON-NOSVE-NEXT: mov v5.b[1], w13 +; NONEON-NOSVE-NEXT: mov v4.b[1], w1 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w23, w11, w9 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w28, w18, w2 +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #52] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #272] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w26, w17, w14 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w11, w10 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: umov w10, v3.b[11] +; NONEON-NOSVE-NEXT: umov w11, v2.b[11] +; NONEON-NOSVE-NEXT: mov v4.b[2], w9 +; NONEON-NOSVE-NEXT: mov v5.b[3], w8 +; NONEON-NOSVE-NEXT: msub w8, w25, w15, w12 +; NONEON-NOSVE-NEXT: ldp w13, w9, [sp, #76] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w27, w16, w0 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w14, w13 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[4], w8 +; NONEON-NOSVE-NEXT: msub w8, w28, w2, w18 +; NONEON-NOSVE-NEXT: ldr w2, [sp, #156] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[3], w9 +; NONEON-NOSVE-NEXT: ldp w12, w9, [sp, #64] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[5], w8 +; NONEON-NOSVE-NEXT: msub w8, w27, w0, w16 +; NONEON-NOSVE-NEXT: ldr w0, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w4, w19, w21 +; NONEON-NOSVE-NEXT: msub w9, w9, w14, w12 +; NONEON-NOSVE-NEXT: umov w12, v3.b[12] +; NONEON-NOSVE-NEXT: umov w14, v2.b[12] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #240] // 16-byte Folded Reload 
+; NONEON-NOSVE-NEXT: mov v5.b[6], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[4], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w7, w5 +; NONEON-NOSVE-NEXT: ldr w5, [sp, #204] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w17, w15 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[7], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w13, w11, w10 +; NONEON-NOSVE-NEXT: mov v4.b[5], w9 +; NONEON-NOSVE-NEXT: ldp w16, w9, [sp, #88] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w6, w3 +; NONEON-NOSVE-NEXT: ldr w3, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w17, w16 +; NONEON-NOSVE-NEXT: umov w16, v3.b[13] +; NONEON-NOSVE-NEXT: umov w17, v2.b[13] +; NONEON-NOSVE-NEXT: mov v5.b[8], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[6], w9 +; NONEON-NOSVE-NEXT: msub w8, w8, w22, w20 +; NONEON-NOSVE-NEXT: udiv w15, w14, w12 +; NONEON-NOSVE-NEXT: ldp w18, w9, [sp, #136] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[9], w8 +; NONEON-NOSVE-NEXT: msub w8, w4, w21, w19 +; NONEON-NOSVE-NEXT: msub w9, w9, w0, w18 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #304] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[7], w9 +; NONEON-NOSVE-NEXT: mov v5.b[10], w8 +; NONEON-NOSVE-NEXT: msub w8, w13, w10, w11 +; NONEON-NOSVE-NEXT: ldp w0, w9, [sp, #124] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #196] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w13, [sp, #192] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w18, w17, w16 +; NONEON-NOSVE-NEXT: msub w9, w9, w1, w0 +; NONEON-NOSVE-NEXT: mov v5.b[11], w8 +; NONEON-NOSVE-NEXT: umov w0, v3.b[14] +; NONEON-NOSVE-NEXT: msub w10, w10, w13, 
w11 +; NONEON-NOSVE-NEXT: umov w1, v2.b[14] +; NONEON-NOSVE-NEXT: msub w8, w15, w12, w14 +; NONEON-NOSVE-NEXT: mov v4.b[8], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w15, w13, [sp, #168] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w3, w2 +; NONEON-NOSVE-NEXT: mov v5.b[12], w8 +; NONEON-NOSVE-NEXT: ldp w4, w3, [sp, #208] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w14, w12, [sp, #176] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[9], w9 +; NONEON-NOSVE-NEXT: udiv w2, w1, w0 +; NONEON-NOSVE-NEXT: umov w9, v3.b[15] +; NONEON-NOSVE-NEXT: msub w3, w3, w5, w4 +; NONEON-NOSVE-NEXT: umov w4, v2.b[15] +; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[10], w3 +; NONEON-NOSVE-NEXT: mov v5.b[13], w8 +; NONEON-NOSVE-NEXT: mov v4.b[11], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #188] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w11, w4, w9 +; NONEON-NOSVE-NEXT: msub w8, w2, w0, w1 +; NONEON-NOSVE-NEXT: msub w10, w10, w13, w12 +; NONEON-NOSVE-NEXT: umov w12, v1.b[15] +; NONEON-NOSVE-NEXT: umov w13, v0.b[15] +; NONEON-NOSVE-NEXT: mov v5.b[14], w8 +; NONEON-NOSVE-NEXT: mov v4.b[12], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #184] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w10, w10, w15, w14 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #152] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w14, w13, w12 +; NONEON-NOSVE-NEXT: msub w8, w11, w9, w4 +; NONEON-NOSVE-NEXT: mov v4.b[13], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #160] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[15], w8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #216] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w10, w10, w16, w15 +; NONEON-NOSVE-NEXT: mov v4.b[14], w10 +; NONEON-NOSVE-NEXT: msub w9, w14, w12, w13 +; NONEON-NOSVE-NEXT: mov v4.b[15], w9 +; NONEON-NOSVE-NEXT: stp q5, q4, [x8] +; NONEON-NOSVE-NEXT: add sp, sp, #320 +; 
NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = urem <32 x i8> %op1, %op2 @@ -599,6 +1873,33 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w11, v1.h[0] +; NONEON-NOSVE-NEXT: umov w12, v0.h[0] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w14, v1.h[2] +; NONEON-NOSVE-NEXT: umov w15, v0.h[2] +; NONEON-NOSVE-NEXT: umov w17, v1.h[3] +; NONEON-NOSVE-NEXT: umov w18, v0.h[3] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.h[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = urem <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -627,6 +1928,51 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umov w11, v1.h[0] +; NONEON-NOSVE-NEXT: umov w12, v0.h[0] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w14, v1.h[2] +; NONEON-NOSVE-NEXT: umov w15, v0.h[2] +; NONEON-NOSVE-NEXT: umov w17, v1.h[3] +; 
NONEON-NOSVE-NEXT: umov w18, v0.h[3] +; NONEON-NOSVE-NEXT: umov w1, v1.h[4] +; NONEON-NOSVE-NEXT: umov w2, v0.h[4] +; NONEON-NOSVE-NEXT: umov w4, v1.h[5] +; NONEON-NOSVE-NEXT: umov w5, v0.h[5] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.h[7] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: umov w11, v0.h[6] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: umov w14, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: udiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: udiv w9, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: udiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: udiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = urem <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -671,6 +2017,139 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill +; 
NONEON-NOSVE-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w20, v1.h[0] +; NONEON-NOSVE-NEXT: umov w21, v0.h[0] +; NONEON-NOSVE-NEXT: umov w19, v0.h[3] +; NONEON-NOSVE-NEXT: umov w5, v1.h[4] +; NONEON-NOSVE-NEXT: umov w2, v0.h[4] +; NONEON-NOSVE-NEXT: umov w1, v3.h[1] +; NONEON-NOSVE-NEXT: umov w23, v2.h[1] +; NONEON-NOSVE-NEXT: umov w25, v3.h[0] +; NONEON-NOSVE-NEXT: umov w26, v2.h[0] +; NONEON-NOSVE-NEXT: umov w6, v1.h[5] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w8, v1.h[2] +; NONEON-NOSVE-NEXT: umov w9, v0.h[2] +; NONEON-NOSVE-NEXT: umov w3, v0.h[5] +; NONEON-NOSVE-NEXT: umov w4, v1.h[6] +; NONEON-NOSVE-NEXT: umov w7, v0.h[6] +; NONEON-NOSVE-NEXT: umov w28, v3.h[2] +; NONEON-NOSVE-NEXT: umov w29, v2.h[2] +; NONEON-NOSVE-NEXT: umov w15, v3.h[3] +; NONEON-NOSVE-NEXT: umov w13, v2.h[3] +; NONEON-NOSVE-NEXT: umov w12, v3.h[4] +; NONEON-NOSVE-NEXT: umov w14, v3.h[5] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w11, w21, w20 +; NONEON-NOSVE-NEXT: str 
w10, [sp, #44] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.h[3] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w11, v2.h[4] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #4] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w20, w22, w20, w21 +; NONEON-NOSVE-NEXT: udiv w9, w19, w8 +; NONEON-NOSVE-NEXT: str w10, [sp, #32] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w10, v3.h[6] +; NONEON-NOSVE-NEXT: fmov s5, w20 +; NONEON-NOSVE-NEXT: umov w20, v3.h[7] +; NONEON-NOSVE-NEXT: udiv w8, w2, w5 +; NONEON-NOSVE-NEXT: udiv w24, w23, w1 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w27, w26, w25 +; NONEON-NOSVE-NEXT: msub w1, w24, w1, w23 +; NONEON-NOSVE-NEXT: ldp w24, w23, [sp, #40] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w9, w3, w6 +; NONEON-NOSVE-NEXT: msub w21, w27, w25, w26 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w23, w23, w25, w24 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s4, w21 +; NONEON-NOSVE-NEXT: mov v5.h[1], w23 +; NONEON-NOSVE-NEXT: ldp w23, w21, [sp, #28] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[1], w1 +; NONEON-NOSVE-NEXT: udiv w8, w7, w4 +; NONEON-NOSVE-NEXT: msub w21, w21, w25, w23 +; NONEON-NOSVE-NEXT: umov w23, v2.h[7] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.h[2], w21 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w30, w29, w28 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w9, v2.h[5] +; NONEON-NOSVE-NEXT: umov w8, v2.h[6] +; NONEON-NOSVE-NEXT: udiv w18, w13, w15 +; NONEON-NOSVE-NEXT: msub w1, w30, w28, w29 +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #48] // 16-byte 
Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[2], w1 +; NONEON-NOSVE-NEXT: udiv w16, w11, w12 +; NONEON-NOSVE-NEXT: msub w13, w18, w15, w13 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w18, [sp] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w15, w15, w18, w19 +; NONEON-NOSVE-NEXT: mov v4.h[3], w13 +; NONEON-NOSVE-NEXT: umov w13, v1.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[3], w15 +; NONEON-NOSVE-NEXT: umov w15, v0.h[7] +; NONEON-NOSVE-NEXT: udiv w17, w9, w14 +; NONEON-NOSVE-NEXT: msub w11, w16, w12, w11 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #16] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w12, w12, w5, w2 +; NONEON-NOSVE-NEXT: mov v4.h[4], w11 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.h[4], w12 +; NONEON-NOSVE-NEXT: msub w11, w11, w6, w3 +; NONEON-NOSVE-NEXT: udiv w24, w8, w10 +; NONEON-NOSVE-NEXT: msub w9, w17, w14, w9 +; NONEON-NOSVE-NEXT: mov v5.h[5], w11 +; NONEON-NOSVE-NEXT: mov v4.h[5], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w4, w7 +; NONEON-NOSVE-NEXT: udiv w18, w23, w20 +; NONEON-NOSVE-NEXT: msub w8, w24, w10, w8 +; NONEON-NOSVE-NEXT: mov v5.h[6], w9 +; NONEON-NOSVE-NEXT: mov v4.h[6], w8 +; NONEON-NOSVE-NEXT: udiv w12, w15, w13 +; NONEON-NOSVE-NEXT: msub w8, w18, w20, w23 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[7], w8 +; NONEON-NOSVE-NEXT: msub w9, w12, w13, w15 +; NONEON-NOSVE-NEXT: mov v5.h[7], w9 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #144 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = urem <16 x i16> %op1, %op2 @@ -689,6 +2168,23 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 
killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w11, v1.s[1] +; NONEON-NOSVE-NEXT: mov w12, v0.s[1] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = urem <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -704,6 +2200,30 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov w11, s1 +; NONEON-NOSVE-NEXT: fmov w12, s0 +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: mov w14, v1.s[2] +; NONEON-NOSVE-NEXT: mov w15, v0.s[2] +; NONEON-NOSVE-NEXT: mov w17, v1.s[3] +; NONEON-NOSVE-NEXT: mov w18, v0.s[3] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: udiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.s[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %res = urem <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -723,6 +2243,65 @@ define void @urem_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s ; CHECK-NEXT: stp q0, q1, 
[x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str x23, [sp, #-48]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -48 +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov w12, s0 +; NONEON-NOSVE-NEXT: fmov w3, s2 +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w11, s1 +; NONEON-NOSVE-NEXT: fmov w2, s3 +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w17, v3.s[1] +; NONEON-NOSVE-NEXT: mov w18, v2.s[1] +; NONEON-NOSVE-NEXT: mov w14, v1.s[2] +; NONEON-NOSVE-NEXT: mov w15, v0.s[2] +; NONEON-NOSVE-NEXT: mov w5, v3.s[2] +; NONEON-NOSVE-NEXT: mov w6, v2.s[2] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: mov w19, v3.s[3] +; NONEON-NOSVE-NEXT: mov w20, v2.s[3] +; NONEON-NOSVE-NEXT: mov w22, v1.s[3] +; NONEON-NOSVE-NEXT: mov w23, v0.s[3] +; NONEON-NOSVE-NEXT: udiv w4, w3, w2 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s1, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w12, w4, w2, w3 +; NONEON-NOSVE-NEXT: fmov s0, w12 +; NONEON-NOSVE-NEXT: udiv w1, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v1.s[1], w8 +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w13, w1, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.s[1], w13 +; NONEON-NOSVE-NEXT: udiv w7, w6, w5 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v1.s[2], w8 +; NONEON-NOSVE-NEXT: udiv w21, w20, w19 +; NONEON-NOSVE-NEXT: msub w10, w7, w5, w6 +; NONEON-NOSVE-NEXT: mov v0.s[2], w10 
+; NONEON-NOSVE-NEXT: udiv w9, w23, w22 +; NONEON-NOSVE-NEXT: msub w10, w21, w19, w20 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v0.s[3], w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w22, w23 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = urem <8 x i32> %op1, %op2 @@ -741,6 +2320,17 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = urem <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -756,6 +2346,20 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: udiv x13, x12, x11 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = urem <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -775,6 +2379,33 @@ define void @urem_v4i64(ptr %a, ptr 
%b) { ; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: fmov x15, d2 +; NONEON-NOSVE-NEXT: mov x12, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x14, d3 +; NONEON-NOSVE-NEXT: mov x11, v3.d[1] +; NONEON-NOSVE-NEXT: mov x17, v1.d[1] +; NONEON-NOSVE-NEXT: mov x18, v0.d[1] +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: udiv x16, x15, x14 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: udiv x13, x12, x11 +; NONEON-NOSVE-NEXT: msub x10, x16, x14, x15 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: udiv x1, x18, x17 +; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: msub x11, x1, x17, x18 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = urem <4 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll index bfffe4b6315d74..0108fb580b947b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,14 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { ; 
CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2 ret <4 x i8> %sel } @@ -31,6 +40,14 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.8b, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2 ret <8 x i8> %sel } @@ -46,6 +63,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.16b, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2 ret <16 x i8> %sel } @@ -64,6 +89,20 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.b, p0, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.16b, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, 
v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a %op2 = load volatile <32 x i8>, ptr %b %sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2 @@ -83,6 +122,14 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2 ret <2 x i16> %sel } @@ -99,6 +146,14 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2 ret <4 x i16> %sel } @@ -115,6 +170,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.8h, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2 ret <8 x i16> %sel } @@ -134,6 +197,20 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret 
+; +; NONEON-NOSVE-LABEL: select_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a %op2 = load volatile <16 x i16>, ptr %b %sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2 @@ -153,6 +230,14 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2 ret <2 x i32> %sel } @@ -169,6 +254,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4s, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2 ret <4 x i32> %sel } @@ -188,6 +281,20 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; 
NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a %op2 = load volatile <8 x i32>, ptr %b %sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2 @@ -208,6 +315,14 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: fmov d2, x8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel } @@ -225,6 +340,14 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: dup v2.2d, x8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2 ret <2 x i64> %sel } @@ -245,6 +368,20 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; 
NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a %op2 = load volatile <4 x i64>, ptr %b %sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll index 9319bd69c25fb6..f7198e3042ad53 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,16 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h +; NONEON-NOSVE-NEXT: sshl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -32,6 +43,12 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.8b, v1.8b +; NONEON-NOSVE-NEXT: sshl v0.8b, v0.8b, v1.8b +; 
NONEON-NOSVE-NEXT: ret %res = ashr <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -45,6 +62,12 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b +; NONEON-NOSVE-NEXT: sshl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = ashr <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -60,6 +83,17 @@ define void @ashr_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: asr z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.16b, v0.16b +; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b +; NONEON-NOSVE-NEXT: sshl v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: sshl v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = ashr <32 x i8> %op1, %op2 @@ -78,6 +112,16 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s +; NONEON-NOSVE-NEXT: sshl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -91,6 +135,12 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i16: +; NONEON-NOSVE: 
// %bb.0: +; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h +; NONEON-NOSVE-NEXT: sshl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -104,6 +154,12 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h +; NONEON-NOSVE-NEXT: sshl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = ashr <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -119,6 +175,17 @@ define void @ashr_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: asr z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.8h, v0.8h +; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h +; NONEON-NOSVE-NEXT: sshl v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: sshl v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = ashr <16 x i16> %op1, %op2 @@ -135,6 +202,12 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s +; NONEON-NOSVE-NEXT: sshl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -148,6 +221,12 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: sshl v0.4s, 
v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -163,6 +242,17 @@ define void @ashr_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: asr z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: sshl v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: sshl v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = ashr <8 x i32> %op1, %op2 @@ -179,6 +269,12 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg d1, d1 +; NONEON-NOSVE-NEXT: sshl d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = ashr <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -192,6 +288,12 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: sshl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -207,6 +309,17 @@ define void @ashr_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: asr z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: sshl v0.2d, v2.2d, v0.2d +; 
NONEON-NOSVE-NEXT: sshl v1.2d, v3.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = ashr <4 x i64> %op1, %op2 @@ -229,6 +342,15 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h +; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -242,6 +364,12 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.8b, v1.8b +; NONEON-NOSVE-NEXT: ushl v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = lshr <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -255,6 +383,12 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ushl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = lshr <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -270,6 +404,17 @@ define void @lshr_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: lsr z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.16b, v0.16b +; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b +; 
NONEON-NOSVE-NEXT: ushl v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: ushl v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = lshr <32 x i8> %op1, %op2 @@ -288,6 +433,15 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s +; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -301,6 +455,12 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h +; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -314,6 +474,12 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ushl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = lshr <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -329,6 +495,17 @@ define void @lshr_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: lsr z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: 
neg v0.8h, v0.8h +; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ushl v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: ushl v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = lshr <16 x i16> %op1, %op2 @@ -345,6 +522,12 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s +; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -358,6 +541,12 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ushl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -373,6 +562,17 @@ define void @lshr_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ushl v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: ushl v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = lshr <8 x i32> %op1, %op2 @@ -389,6 +589,12 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: lshr_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg d1, d1 +; NONEON-NOSVE-NEXT: ushl d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = lshr <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -402,6 +608,12 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ushl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -417,6 +629,17 @@ define void @lshr_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ushl v0.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: ushl v1.2d, v3.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = lshr <4 x i64> %op1, %op2 @@ -438,6 +661,13 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v2i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x0000ff000000ff +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shl <2 x i8> %op1, %op2 ret <2 x i8> %res } @@ -452,6 +682,13 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = shl <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -465,6 +702,11 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = shl <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -478,6 +720,11 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = shl <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -493,6 +740,15 @@ define void @shl_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: lsl z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ushl v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: ushl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = shl <32 x i8> %op1, %op2 @@ -509,6 +765,11 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = shl <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -522,6 +783,11 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) 
{ ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = shl <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -537,6 +803,15 @@ define void @shl_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ushl v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ushl v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = shl <16 x i16> %op1, %op2 @@ -553,6 +828,11 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shl <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -566,6 +846,11 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = shl <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -581,6 +866,15 @@ define void @shl_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ushl v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ushl v1.4s, 
v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = shl <8 x i32> %op1, %op2 @@ -597,6 +891,11 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = shl <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -610,6 +909,11 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = shl <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -625,6 +929,15 @@ define void @shl_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ushl v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: ushl v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = shl <4 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 27dbfc9a23a8de..42d3b9d8f71f86 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; 
RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -15,6 +16,13 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) { ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res } @@ -27,6 +35,22 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ushll v1.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x half> store <8 x half> %res, ptr %b @@ -42,6 +66,29 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ushll v2.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v3.4s +; NONEON-NOSVE-NEXT: stp q2, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x half> store <16 x half> %res, ptr %b @@ -61,6 +108,13 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ucvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res } @@ -74,6 +128,12 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) { ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res } @@ -90,6 +150,20 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -114,6 +188,26 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x float> store <16 x float> %res, ptr %b @@ -132,6 +226,13 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) { ; CHECK-NEXT: and w8, w8, #0xffff ; CHECK-NEXT: ucvtf d0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w8, v0.h[0] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ret %res = uitofp <1 x i16> %op1 to <1 
x double> ret <1 x double> %res } @@ -146,6 +247,14 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res } @@ -163,6 +272,21 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %res = uitofp <4 x i16> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -191,6 +315,30 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q3, [x1] ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q2, [x1] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -239,6 +387,46 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] +; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ushll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: ushll v7.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v5.2d, v5.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: ucvtf v4.2d, v4.2d +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v7.2d +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v6.2d +; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x double> store <16 x double> %res, ptr %b @@ -258,6 +446,13 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; 
NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res } @@ -271,6 +466,12 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res } @@ -288,6 +489,15 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x half> ret <8 x half> %res @@ -312,6 +522,21 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v16i32_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i32>, ptr %a %res = uitofp <16 x i32> %op1 to <16 x half> store <16 x half> %res, ptr %b @@ -330,6 +555,11 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) { ; 
CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res } @@ -342,6 +572,11 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) { ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res } @@ -355,6 +590,14 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -374,6 +617,12 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res } @@ -390,6 +639,20 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = uitofp <4 x i32> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -414,6 +677,26 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -440,6 +723,18 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h2, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; 
NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x half> ret <2 x half> %res } @@ -460,6 +755,16 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x half> ret <4 x half> %res @@ -493,6 +798,22 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn v2.2s, v2.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: fcvtn2 v2.4s, v3.2d +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v2.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i64>, ptr %a %res = uitofp <8 x i64> %op1 to <8 x half> ret <8 x half> %res @@ -511,6 +832,12 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = 
uitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res } @@ -528,6 +855,15 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x float> ret <4 x float> %res @@ -552,6 +888,21 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v2.2d +; NONEON-NOSVE-NEXT: fcvtn2 v1.4s, v3.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i64>, ptr %a %res = uitofp <8 x i64> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -570,6 +921,11 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res } @@ -583,6 +939,14 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; 
CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -601,6 +965,13 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) { ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res } @@ -613,6 +984,22 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: sshll v1.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x half> store <8 x half> %res, ptr %b @@ -628,6 +1015,29 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: sshll v2.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: scvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v3.4s +; NONEON-NOSVE-NEXT: stp q2, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x half> store <16 x half> %res, ptr %b @@ -646,6 +1056,13 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, 
v0.2s, #16 +; NONEON-NOSVE-NEXT: scvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res } @@ -659,6 +1076,12 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res } @@ -675,6 +1098,20 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -699,6 +1136,26 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: scvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: scvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x float> store <16 x float> %res, ptr %b @@ -720,6 +1177,14 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res } @@ -737,6 +1202,21 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %res = sitofp <4 x i16> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -765,6 +1245,30 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q3, [x1] ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q2, [x1] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -813,6 +1317,46 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: sshll v7.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: scvtf v5.2d, v5.2d +; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: scvtf v4.2d, v4.2d +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: scvtf v0.2d, v7.2d +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: scvtf v1.2d, v6.2d +; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x double> store <16 x double> %res, ptr %b @@ -832,6 +1376,13 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; 
NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res } @@ -845,6 +1396,12 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res } @@ -862,6 +1419,15 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x half> ret <8 x half> %res @@ -879,6 +1445,11 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res } @@ -891,6 +1462,11 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res } @@ -904,6 +1480,14 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z1.s, p0/m, z1.s ; CHECK-NEXT: 
stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -923,6 +1507,12 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res } @@ -939,6 +1529,20 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = sitofp <4 x i32> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -963,6 +1567,26 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -1007,6 +1631,40 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q1, [x1] ; CHECK-NEXT: stp q4, q0, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: ldr d7, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v7.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v4.2d, v4.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: scvtf v5.2d, v5.2d +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q2, q4, [x1, #96] +; NONEON-NOSVE-NEXT: scvtf v2.2d, v6.2d +; NONEON-NOSVE-NEXT: stp q3, q5, [x1, #64] +; NONEON-NOSVE-NEXT: scvtf v3.2d, v7.2d +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i32>, ptr %a %res = sitofp <16 x i32> %op1 to <16 x double> store <16 x double> %res, ptr %b @@ -1033,6 +1691,18 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h2, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x half> ret <2 x half> %res } @@ -1053,6 +1723,16 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) { ; CHECK-NEXT: 
uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x half> ret <4 x half> %res @@ -1071,6 +1751,12 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res } @@ -1088,6 +1774,15 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x float> ret <4 x float> %res @@ -1105,6 +1800,11 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res } @@ -1118,6 +1818,14 @@ define void @scvtf_v4i64_v4f64(ptr 
%a, ptr %b) { ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -1130,6 +1838,13 @@ define half @scvtf_i16_f16(ptr %0) { ; CHECK-NEXT: ldrsh w8, [x0] ; CHECK-NEXT: scvtf h0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i16_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrsh w8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = sitofp i16 %2 to half ret half %3 @@ -1141,6 +1856,12 @@ define float @scvtf_i16_f32(ptr %0) { ; CHECK-NEXT: ldrsh w8, [x0] ; CHECK-NEXT: scvtf s0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i16_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrsh w8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = sitofp i16 %2 to float ret float %3 @@ -1152,6 +1873,12 @@ define double @scvtf_i16_f64(ptr %0) { ; CHECK-NEXT: ldrsh w8, [x0] ; CHECK-NEXT: scvtf d0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i16_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrsh w8, [x0] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = sitofp i16 %2 to double ret double %3 @@ -1163,6 +1890,13 @@ define half @scvtf_i32_f16(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: scvtf h0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i32_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = sitofp i32 
%2 to half ret half %3 @@ -1174,6 +1908,12 @@ define float @scvtf_i32_f32(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: scvtf s0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i32_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = sitofp i32 %2 to float ret float %3 @@ -1185,6 +1925,12 @@ define double @scvtf_i32_f64(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: scvtf d0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i32_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = sitofp i32 %2 to double ret double %3 @@ -1196,6 +1942,13 @@ define half @scvtf_i64_f16(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: scvtf h0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i64_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = sitofp i64 %2 to half ret half %3 @@ -1207,6 +1960,12 @@ define float @scvtf_i64_f32(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: scvtf s0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i64_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = sitofp i64 %2 to float ret float %3 @@ -1218,6 +1977,12 @@ define double @scvtf_i64_f64(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: scvtf d0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i64_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = sitofp i64 %2 to double ret double %3 @@ -1229,6 +1994,13 @@ define half @ucvtf_i16_f16(ptr %0) { ; CHECK-NEXT: ldrh w8, [x0] ; 
CHECK-NEXT: ucvtf h0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i16_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to half ret half %3 @@ -1240,6 +2012,12 @@ define float @ucvtf_i16_f32(ptr %0) { ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ucvtf s0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i16_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to float ret float %3 @@ -1251,6 +2029,12 @@ define double @ucvtf_i16_f64(ptr %0) { ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ucvtf d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i16_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to double ret double %3 @@ -1262,6 +2046,13 @@ define half @ucvtf_i32_f16(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: ucvtf h0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i32_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = uitofp i32 %2 to half ret half %3 @@ -1273,6 +2064,12 @@ define float @ucvtf_i32_f32(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: ucvtf s0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i32_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = uitofp i32 %2 to float ret float %3 @@ -1284,6 +2081,12 @@ define double @ucvtf_i32_f64(ptr %0) { ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ucvtf d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
ucvtf_i32_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = uitofp i32 %2 to double ret double %3 @@ -1295,6 +2098,13 @@ define half @ucvtf_i64_f16(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: ucvtf h0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i64_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = uitofp i64 %2 to half ret half %3 @@ -1306,6 +2116,12 @@ define float @ucvtf_i64_f32(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: ucvtf s0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i64_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = uitofp i64 %2 to float ret float %3 @@ -1317,6 +2133,12 @@ define double @ucvtf_i64_f64(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: ucvtf d0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i64_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, x8 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = uitofp i64 %2 to double ret double %3 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 3775a64a89a0cb..250929df6b3c35 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < 
%s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,13 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2 ret <4 x i8> %sel } @@ -36,6 +44,13 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) { ; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.8b, v2.8b, #7 +; NONEON-NOSVE-NEXT: cmlt v2.8b, v2.8b, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2 ret <8 x i8> %sel } @@ -54,6 +69,13 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) ; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.16b, v2.16b, #7 +; NONEON-NOSVE-NEXT: cmlt v2.16b, v2.16b, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2 ret <16 x i8> %sel } @@ -70,6 +92,18 @@ define void @select_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.b, p0, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: cmeq v4.16b, v0.16b, v1.16b +; 
NONEON-NOSVE-NEXT: cmeq v5.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %mask = icmp eq <32 x i8> %op1, %op2 @@ -92,6 +126,13 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2 ret <2 x i16> %sel } @@ -110,6 +151,13 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2 ret <4 x i16> %sel } @@ -129,6 +177,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: shl v2.8h, v2.8h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.8h, v2.8h, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2 ret <8 x i16> %sel } @@ -145,6 +201,18 @@ define void 
@select_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: cmeq v4.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: cmeq v5.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %mask = icmp eq <16 x i16> %op1, %op2 @@ -167,6 +235,13 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2 ret <2 x i32> %sel } @@ -186,6 +261,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: shl v2.4s, v2.4s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.4s, v2.4s, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2 ret <4 x i32> %sel } @@ -202,6 +285,18 @@ define void @select_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: cmeq v4.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: cmeq v5.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %mask = icmp eq <8 x i32> %op1, %op2 @@ -223,6 +318,14 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) { ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: fmov d2, x8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel } @@ -242,6 +345,14 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) { ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: shl v2.2d, v2.2d, #63 +; NONEON-NOSVE-NEXT: cmlt v2.2d, v2.2d, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2 ret <2 x i64> %sel } @@ -258,6 +369,18 @@ define void @select_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: cmeq v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmeq v5.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b 
+; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %mask = icmp eq <4 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll index 918f0ccc0cf6a0..42c439ca4b38d4 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,19 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) { ; CHECK-NEXT: stp q2, q5, [x0, #32] ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q3, q4, [x0] +; NONEON-NOSVE-NEXT: add v2.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v5.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: dup v0.4s, v1.s[2] +; NONEON-NOSVE-NEXT: add v1.4s, v3.4s, v3.4s +; NONEON-NOSVE-NEXT: add v3.4s, v4.4s, v4.4s +; NONEON-NOSVE-NEXT: stp q2, q5, [x0, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 %1 = load <16 x i32>, ptr %arg2, align 256 @@ -42,6 +56,19 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) { ; CHECK-NEXT: stp q3, q4, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test2: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, 
#32] +; NONEON-NOSVE-NEXT: ldp q3, q4, [x0] +; NONEON-NOSVE-NEXT: add v2.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: dup v0.2s, v1.s[2] +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: add v3.4s, v3.4s, v3.4s +; NONEON-NOSVE-NEXT: add v4.4s, v4.4s, v4.4s +; NONEON-NOSVE-NEXT: stp q2, q1, [x0, #32] +; NONEON-NOSVE-NEXT: stp q3, q4, [x0] +; NONEON-NOSVE-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 %1 = load <16 x i32>, ptr %arg2, align 256 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll index 8c69d5b0bb375d..992b667a2eafe1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -11,6 +12,13 @@ define <4 x i8> @load_v4i8(ptr %a) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = load <4 x i8>, ptr %a ret <4 x i8> %load } @@ -20,6 +28,11 @@ define <8 x i8> @load_v8i8(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x i8>, ptr %a ret <8 x i8> %load } @@ -29,6 +42,11 @@ define <16 x i8> @load_v16i8(ptr %a) { ; CHECK: // %bb.0: 
; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <16 x i8>, ptr %a ret <16 x i8> %load } @@ -38,6 +56,11 @@ define <32 x i8> @load_v32i8(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <32 x i8>, ptr %a ret <32 x i8> %load } @@ -49,6 +72,15 @@ define <2 x i16> @load_v2i16(ptr %a) { ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: add x8, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = load <2 x i16>, ptr %a ret <2 x i16> %load } @@ -58,6 +90,11 @@ define <2 x half> @load_v2f16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x half>, ptr %a ret <2 x half> %load } @@ -67,6 +104,11 @@ define <4 x i16> @load_v4i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x i16>, ptr %a ret <4 x i16> %load } @@ -76,6 +118,11 @@ define <4 x half> @load_v4f16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x half>, ptr %a ret <4 x half> %load } @@ -85,6 +132,11 @@ define <8 
x i16> @load_v8i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x i16>, ptr %a ret <8 x i16> %load } @@ -94,6 +146,11 @@ define <8 x half> @load_v8f16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x half>, ptr %a ret <8 x half> %load } @@ -103,6 +160,11 @@ define <16 x i16> @load_v16i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <16 x i16>, ptr %a ret <16 x i16> %load } @@ -112,6 +174,11 @@ define <16 x half> @load_v16f16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <16 x half>, ptr %a ret <16 x half> %load } @@ -121,6 +188,11 @@ define <2 x i32> @load_v2i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x i32>, ptr %a ret <2 x i32> %load } @@ -130,6 +202,11 @@ define <2 x float> @load_v2f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x float>, ptr %a ret <2 x float> %load } @@ -139,6 +216,11 @@ define <4 x i32> @load_v4i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4i32: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x i32>, ptr %a ret <4 x i32> %load } @@ -148,6 +230,11 @@ define <4 x float> @load_v4f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x float>, ptr %a ret <4 x float> %load } @@ -157,6 +244,11 @@ define <8 x i32> @load_v8i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x i32>, ptr %a ret <8 x i32> %load } @@ -166,6 +258,11 @@ define <8 x float> @load_v8f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x float>, ptr %a ret <8 x float> %load } @@ -175,6 +272,11 @@ define <1 x i64> @load_v1i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <1 x i64>, ptr %a ret <1 x i64> %load } @@ -184,6 +286,11 @@ define <1 x double> @load_v1f64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <1 x double>, ptr %a ret <1 x double> %load } @@ -193,6 +300,11 @@ define <2 x i64> @load_v2i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x i64>, ptr %a ret <2 x i64> %load } @@ 
-202,6 +314,11 @@ define <2 x double> @load_v2f64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x double>, ptr %a ret <2 x double> %load } @@ -211,6 +328,11 @@ define <4 x i64> @load_v4i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x i64>, ptr %a ret <4 x i64> %load } @@ -220,6 +342,11 @@ define <4 x double> @load_v4f64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x double>, ptr %a ret <4 x double> %load } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll index ef52eadc5d3b09..7abe73f08dfd65 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,14 @@ define i8 @andv_v4i8(<4 x i8> %a) { ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; 
NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a) ret i8 %res } @@ -29,6 +38,15 @@ define i8 @andv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: andv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a) ret i8 %res } @@ -41,6 +59,20 @@ define i8 @andv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: andv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a) ret i8 %res } @@ -54,6 +86,22 @@ define i8 @andv_v32i8(ptr %a) { ; CHECK-NEXT: andv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op) ret i8 %res @@ -67,6 +115,13 @@ define i16 @andv_v2i16(<2 x i16> %a) { ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a) ret i16 %res } @@ -79,6 +134,14 @@ define i16 @andv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a) ret i16 %res } @@ -91,6 +154,19 @@ define i16 @andv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a) ret i16 %res } @@ -104,6 +180,21 @@ define i16 @andv_v16i16(ptr %a) { ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op) ret i16 %res @@ -117,6 +208,13 @@ define i32 @andv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a) ret i32 %res } @@ -129,6 +227,18 @@ define i32 @andv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) ret i32 %res } @@ -142,6 +252,20 @@ define i32 @andv_v8i32(ptr %a) { ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op) ret i32 %res @@ -155,6 +279,16 @@ define i64 @andv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: andv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a) ret i64 %res } @@ -168,6 +302,18 @@ define i64 @andv_v4i64(ptr %a) { ; CHECK-NEXT: andv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op) ret i64 %res @@ -185,6 +331,14 @@ define i8 @eorv_v4i8(<4 x i8> %a) { ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a) ret i8 %res } @@ -197,6 +351,15 @@ define i8 @eorv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: eorv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a) ret i8 %res } @@ -209,6 +372,20 @@ define i8 @eorv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: eorv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a) ret i8 %res } @@ -222,6 +399,22 @@ define i8 @eorv_v32i8(ptr %a) { ; CHECK-NEXT: eorv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op) ret i8 %res @@ -235,6 +428,13 @@ define i16 @eorv_v2i16(<2 x i16> %a) { ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a) ret i16 %res } @@ -247,6 +447,14 @@ define i16 @eorv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: 
lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a) ret i16 %res } @@ -259,6 +467,19 @@ define i16 @eorv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a) ret i16 %res } @@ -272,6 +493,21 @@ define i16 @eorv_v16i16(ptr %a) { ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op) ret i16 %res @@ -285,6 +521,13 @@ define i32 @eorv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a) ret i32 %res } @@ -297,6 +540,18 @@ define i32 @eorv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) ret i32 %res } @@ -310,6 +565,20 @@ define i32 @eorv_v8i32(ptr %a) { ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op) ret i32 %res @@ -323,6 +592,16 @@ define i64 @eorv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: eorv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a) ret i64 %res } @@ -336,6 +615,18 @@ define i64 @eorv_v4i64(ptr %a) { ; CHECK-NEXT: eorv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op) ret i64 %res @@ -353,6 +644,14 @@ define i8 @orv_v4i8(<4 x i8> %a) { ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a) ret i8 %res } @@ -365,6 +664,15 @@ define i8 @orv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: orv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a) ret i8 %res } @@ -377,6 +685,20 @@ define i8 @orv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: orv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a) ret i8 %res } @@ -390,6 +712,22 @@ define i8 @orv_v32i8(ptr %a) { ; CHECK-NEXT: orv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op) ret i8 %res @@ -403,6 +741,13 @@ define i16 @orv_v2i16(<2 x i16> %a) { ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a) ret i16 %res } @@ -415,6 +760,14 @@ define i16 @orv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, 
#16 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a) ret i16 %res } @@ -427,6 +780,19 @@ define i16 @orv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a) ret i16 %res } @@ -440,6 +806,21 @@ define i16 @orv_v16i16(ptr %a) { ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op) ret i16 %res @@ -453,6 +834,13 @@ define i32 @orv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a) ret i32 %res } @@ -465,6 +853,18 @@ define i32 @orv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) ret i32 %res } @@ -478,6 +878,20 @@ define i32 @orv_v8i32(ptr %a) { ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op) ret i32 %res @@ -491,6 +905,16 @@ define i64 @orv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: orv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a) ret i64 %res } @@ -504,6 +928,18 @@ define i64 @orv_v4i64(ptr %a) { ; CHECK-NEXT: orv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op) ret i64 %res diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 4f8f8c2e4b244a..6c33613f8e757d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,44 @@ define <4 x i8> @masked_load_v4i8(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI0_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB0_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[0], [x0] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB0_3 +; NONEON-NOSVE-NEXT: b .LBB0_4 +; NONEON-NOSVE-NEXT: .LBB0_2: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB0_4 +; NONEON-NOSVE-NEXT: .LBB0_3: // %cond.load1 +; 
NONEON-NOSVE-NEXT: add x9, x0, #1 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: .LBB0_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB0_7 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB0_8 +; NONEON-NOSVE-NEXT: .LBB0_6: // %else8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB0_7: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB0_6 +; NONEON-NOSVE-NEXT: .LBB0_8: // %cond.load7 +; NONEON-NOSVE-NEXT: add x8, x0, #3 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <4 x i8> @llvm.masked.load.v4i8(ptr %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer) ret <4 x i8> %load } @@ -34,6 +73,67 @@ define <8 x i8> @masked_load_v8i8(ptr %src, <8 x i1> %mask) { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB1_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr b0, [x0] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB1_3 +; NONEON-NOSVE-NEXT: b .LBB1_4 +; NONEON-NOSVE-NEXT: .LBB1_2: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB1_4 +; NONEON-NOSVE-NEXT: .LBB1_3: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #1 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] +; NONEON-NOSVE-NEXT: .LBB1_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB1_11 +; NONEON-NOSVE-NEXT: 
// %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB1_12 +; NONEON-NOSVE-NEXT: .LBB1_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB1_13 +; NONEON-NOSVE-NEXT: .LBB1_7: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB1_14 +; NONEON-NOSVE-NEXT: .LBB1_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB1_15 +; NONEON-NOSVE-NEXT: .LBB1_9: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_16 +; NONEON-NOSVE-NEXT: .LBB1_10: // %else20 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB1_11: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB1_6 +; NONEON-NOSVE-NEXT: .LBB1_12: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #3 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB1_7 +; NONEON-NOSVE-NEXT: .LBB1_13: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB1_8 +; NONEON-NOSVE-NEXT: .LBB1_14: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #5 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB1_9 +; NONEON-NOSVE-NEXT: .LBB1_15: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_10 +; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.load19 +; NONEON-NOSVE-NEXT: add x8, x0, #7 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer) ret <8 x i8> %load } @@ -49,6 +149,115 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v16i8: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: addv h1, v0.8h +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18 +; NONEON-NOSVE-NEXT: .LBB2_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB2_19 +; NONEON-NOSVE-NEXT: .LBB2_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB2_20 +; NONEON-NOSVE-NEXT: .LBB2_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB2_21 +; NONEON-NOSVE-NEXT: .LBB2_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB2_22 +; NONEON-NOSVE-NEXT: .LBB2_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB2_23 +; NONEON-NOSVE-NEXT: .LBB2_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB2_24 +; NONEON-NOSVE-NEXT: .LBB2_8: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB2_25 +; NONEON-NOSVE-NEXT: .LBB2_9: // %else23 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB2_26 +; NONEON-NOSVE-NEXT: .LBB2_10: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB2_27 +; NONEON-NOSVE-NEXT: .LBB2_11: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB2_28 +; NONEON-NOSVE-NEXT: .LBB2_12: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB2_29 +; NONEON-NOSVE-NEXT: .LBB2_13: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB2_30 +; NONEON-NOSVE-NEXT: .LBB2_14: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB2_31 +; NONEON-NOSVE-NEXT: .LBB2_15: // %else41 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32 +; NONEON-NOSVE-NEXT: .LBB2_16: // %else44 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.load +; NONEON-NOSVE-NEXT: ldr b0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB2_2 
+; NONEON-NOSVE-NEXT: .LBB2_18: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #1 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB2_3 +; NONEON-NOSVE-NEXT: .LBB2_19: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB2_4 +; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #3 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB2_5 +; NONEON-NOSVE-NEXT: .LBB2_21: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB2_6 +; NONEON-NOSVE-NEXT: .LBB2_22: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #5 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB2_7 +; NONEON-NOSVE-NEXT: .LBB2_23: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB2_8 +; NONEON-NOSVE-NEXT: .LBB2_24: // %cond.load19 +; NONEON-NOSVE-NEXT: add x9, x0, #7 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB2_9 +; NONEON-NOSVE-NEXT: .LBB2_25: // %cond.load22 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[8], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB2_10 +; NONEON-NOSVE-NEXT: .LBB2_26: // %cond.load25 +; NONEON-NOSVE-NEXT: add x9, x0, #9 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[9], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB2_11 +; NONEON-NOSVE-NEXT: .LBB2_27: // %cond.load28 +; NONEON-NOSVE-NEXT: add x9, x0, #10 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[10], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB2_12 +; NONEON-NOSVE-NEXT: .LBB2_28: // %cond.load31 +; NONEON-NOSVE-NEXT: add x9, x0, #11 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[11], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB2_13 +; NONEON-NOSVE-NEXT: .LBB2_29: // %cond.load34 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: 
ld1 { v0.b }[12], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB2_14 +; NONEON-NOSVE-NEXT: .LBB2_30: // %cond.load37 +; NONEON-NOSVE-NEXT: add x9, x0, #13 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[13], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB2_15 +; NONEON-NOSVE-NEXT: .LBB2_31: // %cond.load40 +; NONEON-NOSVE-NEXT: add x9, x0, #14 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[14], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16 +; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.load43 +; NONEON-NOSVE-NEXT: add x8, x0, #15 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[15], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer) ret <16 x i8> %load } @@ -130,6 +339,277 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) { ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: fmov s1, w1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] +; NONEON-NOSVE-NEXT: mov v1.b[1], w2 +; NONEON-NOSVE-NEXT: mov v0.b[1], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: mov v1.b[2], w3 +; NONEON-NOSVE-NEXT: mov v0.b[2], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: mov v1.b[3], w4 +; NONEON-NOSVE-NEXT: mov v0.b[3], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] +; NONEON-NOSVE-NEXT: mov v1.b[4], w5 +; NONEON-NOSVE-NEXT: mov v0.b[4], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: mov v1.b[5], w6 +; NONEON-NOSVE-NEXT: mov v0.b[5], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] +; NONEON-NOSVE-NEXT: mov v1.b[6], w7 +; NONEON-NOSVE-NEXT: mov v0.b[6], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: mov v1.b[7], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: mov v0.b[7], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, 
#136] +; NONEON-NOSVE-NEXT: mov v1.b[8], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: mov v0.b[8], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: mov v1.b[9], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: mov v0.b[9], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] +; NONEON-NOSVE-NEXT: mov v1.b[10], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: mov v0.b[10], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: mov v1.b[11], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: mov v0.b[11], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] +; NONEON-NOSVE-NEXT: mov v1.b[12], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: mov v0.b[12], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: mov v1.b[13], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] +; NONEON-NOSVE-NEXT: mov v0.b[13], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] +; NONEON-NOSVE-NEXT: mov v1.b[14], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #64] +; NONEON-NOSVE-NEXT: mov v0.b[14], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: mov v1.b[15], w9 +; NONEON-NOSVE-NEXT: mov v0.b[15], w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] +; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: addv h1, v1.8h +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: movi v0.2d, 
#0000000000000000 +; NONEON-NOSVE-NEXT: bfi w8, w9, #16, #16 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34 +; NONEON-NOSVE-NEXT: .LBB3_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35 +; NONEON-NOSVE-NEXT: .LBB3_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36 +; NONEON-NOSVE-NEXT: .LBB3_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37 +; NONEON-NOSVE-NEXT: .LBB3_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38 +; NONEON-NOSVE-NEXT: .LBB3_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39 +; NONEON-NOSVE-NEXT: .LBB3_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40 +; NONEON-NOSVE-NEXT: .LBB3_8: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41 +; NONEON-NOSVE-NEXT: .LBB3_9: // %else23 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42 +; NONEON-NOSVE-NEXT: .LBB3_10: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43 +; NONEON-NOSVE-NEXT: .LBB3_11: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44 +; NONEON-NOSVE-NEXT: .LBB3_12: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45 +; NONEON-NOSVE-NEXT: .LBB3_13: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46 +; NONEON-NOSVE-NEXT: .LBB3_14: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47 +; NONEON-NOSVE-NEXT: .LBB3_15: // %else41 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48 +; NONEON-NOSVE-NEXT: .LBB3_16: // %else44 +; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49 +; NONEON-NOSVE-NEXT: .LBB3_17: // %else47 +; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50 +; NONEON-NOSVE-NEXT: .LBB3_18: // %else50 +; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51 +; NONEON-NOSVE-NEXT: .LBB3_19: // %else53 +; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52 +; NONEON-NOSVE-NEXT: .LBB3_20: // %else56 +; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53 +; NONEON-NOSVE-NEXT: .LBB3_21: // %else59 +; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54 +; NONEON-NOSVE-NEXT: .LBB3_22: // %else62 +; NONEON-NOSVE-NEXT: 
tbnz w8, #22, .LBB3_55 +; NONEON-NOSVE-NEXT: .LBB3_23: // %else65 +; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56 +; NONEON-NOSVE-NEXT: .LBB3_24: // %else68 +; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57 +; NONEON-NOSVE-NEXT: .LBB3_25: // %else71 +; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58 +; NONEON-NOSVE-NEXT: .LBB3_26: // %else74 +; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_59 +; NONEON-NOSVE-NEXT: .LBB3_27: // %else77 +; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60 +; NONEON-NOSVE-NEXT: .LBB3_28: // %else80 +; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61 +; NONEON-NOSVE-NEXT: .LBB3_29: // %else83 +; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62 +; NONEON-NOSVE-NEXT: .LBB3_30: // %else86 +; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63 +; NONEON-NOSVE-NEXT: .LBB3_31: // %else89 +; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64 +; NONEON-NOSVE-NEXT: .LBB3_32: // %else92 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.load +; NONEON-NOSVE-NEXT: ldr b0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2 +; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #1 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3 +; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4 +; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #3 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5 +; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6 +; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #5 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7 +; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] +; 
NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8 +; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.load19 +; NONEON-NOSVE-NEXT: add x9, x0, #7 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9 +; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.load22 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[8], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10 +; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.load25 +; NONEON-NOSVE-NEXT: add x9, x0, #9 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[9], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11 +; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.load28 +; NONEON-NOSVE-NEXT: add x9, x0, #10 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[10], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12 +; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.load31 +; NONEON-NOSVE-NEXT: add x9, x0, #11 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[11], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13 +; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.load34 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[12], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14 +; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.load37 +; NONEON-NOSVE-NEXT: add x9, x0, #13 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[13], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15 +; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.load40 +; NONEON-NOSVE-NEXT: add x9, x0, #14 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[14], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16 +; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.load43 +; NONEON-NOSVE-NEXT: add x9, x0, #15 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[15], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17 +; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.load46 +; NONEON-NOSVE-NEXT: add x9, x0, #16 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[0], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18 +; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.load49 +; NONEON-NOSVE-NEXT: add x9, x0, #17 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19 +; NONEON-NOSVE-NEXT: .LBB3_51: // 
%cond.load52 +; NONEON-NOSVE-NEXT: add x9, x0, #18 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20 +; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.load55 +; NONEON-NOSVE-NEXT: add x9, x0, #19 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21 +; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.load58 +; NONEON-NOSVE-NEXT: add x9, x0, #20 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22 +; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.load61 +; NONEON-NOSVE-NEXT: add x9, x0, #21 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23 +; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.load64 +; NONEON-NOSVE-NEXT: add x9, x0, #22 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24 +; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.load67 +; NONEON-NOSVE-NEXT: add x9, x0, #23 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[7], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25 +; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.load70 +; NONEON-NOSVE-NEXT: add x9, x0, #24 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[8], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26 +; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.load73 +; NONEON-NOSVE-NEXT: add x9, x0, #25 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[9], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27 +; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.load76 +; NONEON-NOSVE-NEXT: add x9, x0, #26 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[10], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28 +; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.load79 +; NONEON-NOSVE-NEXT: add x9, x0, #27 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[11], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29 +; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.load82 +; NONEON-NOSVE-NEXT: add x9, x0, #28 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[12], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30 +; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.load85 +; NONEON-NOSVE-NEXT: add x9, x0, #29 +; NONEON-NOSVE-NEXT: ld1 { 
v1.b }[13], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31 +; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.load88 +; NONEON-NOSVE-NEXT: add x9, x0, #30 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[14], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32 +; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.load91 +; NONEON-NOSVE-NEXT: add x8, x0, #31 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[15], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer) ret <32 x i8> %load } @@ -155,6 +635,31 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB4_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_4 +; NONEON-NOSVE-NEXT: .LBB4_2: // %else2 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB4_2 +; NONEON-NOSVE-NEXT: .LBB4_4: // %cond.load1 +; NONEON-NOSVE-NEXT: add x8, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <2 x half> @llvm.masked.load.v2f16(ptr %src, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer) ret <2 x half> %load } @@ -170,6 +675,43 @@ define <4 x half> @masked_load_v4f16(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: 
// kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h1, v0.4h +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB5_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_6 +; NONEON-NOSVE-NEXT: .LBB5_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB5_7 +; NONEON-NOSVE-NEXT: .LBB5_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB5_8 +; NONEON-NOSVE-NEXT: .LBB5_4: // %else8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB5_5: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB5_2 +; NONEON-NOSVE-NEXT: .LBB5_6: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB5_3 +; NONEON-NOSVE-NEXT: .LBB5_7: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB5_4 +; NONEON-NOSVE-NEXT: .LBB5_8: // %cond.load7 +; NONEON-NOSVE-NEXT: add x8, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <4 x half> @llvm.masked.load.v4f16(ptr %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer) ret <4 x half> %load } @@ -186,6 +728,65 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl 
v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b1, v0.8b +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB6_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_10 +; NONEON-NOSVE-NEXT: .LBB6_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB6_11 +; NONEON-NOSVE-NEXT: .LBB6_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB6_12 +; NONEON-NOSVE-NEXT: .LBB6_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB6_13 +; NONEON-NOSVE-NEXT: .LBB6_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB6_14 +; NONEON-NOSVE-NEXT: .LBB6_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB6_15 +; NONEON-NOSVE-NEXT: .LBB6_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_16 +; NONEON-NOSVE-NEXT: .LBB6_8: // %else20 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB6_9: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB6_2 +; NONEON-NOSVE-NEXT: .LBB6_10: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB6_3 +; NONEON-NOSVE-NEXT: .LBB6_11: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB6_4 +; NONEON-NOSVE-NEXT: .LBB6_12: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB6_5 +; NONEON-NOSVE-NEXT: .LBB6_13: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB6_6 +; NONEON-NOSVE-NEXT: .LBB6_14: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #10 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, 
.LBB6_7 +; NONEON-NOSVE-NEXT: .LBB6_15: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB6_8 +; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.load19 +; NONEON-NOSVE-NEXT: add x8, x0, #14 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer) ret <8 x half> %load } @@ -210,6 +811,116 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) { ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: addv h2, v0.8h +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s2 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18 +; NONEON-NOSVE-NEXT: .LBB7_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB7_19 +; NONEON-NOSVE-NEXT: .LBB7_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB7_20 +; NONEON-NOSVE-NEXT: .LBB7_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB7_21 +; NONEON-NOSVE-NEXT: .LBB7_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB7_22 +; NONEON-NOSVE-NEXT: .LBB7_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB7_23 +; NONEON-NOSVE-NEXT: .LBB7_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB7_24 +; NONEON-NOSVE-NEXT: .LBB7_8: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB7_25 +; 
NONEON-NOSVE-NEXT: .LBB7_9: // %else23 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB7_26 +; NONEON-NOSVE-NEXT: .LBB7_10: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB7_27 +; NONEON-NOSVE-NEXT: .LBB7_11: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB7_28 +; NONEON-NOSVE-NEXT: .LBB7_12: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB7_29 +; NONEON-NOSVE-NEXT: .LBB7_13: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB7_30 +; NONEON-NOSVE-NEXT: .LBB7_14: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB7_31 +; NONEON-NOSVE-NEXT: .LBB7_15: // %else41 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32 +; NONEON-NOSVE-NEXT: .LBB7_16: // %else44 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB7_2 +; NONEON-NOSVE-NEXT: .LBB7_18: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB7_3 +; NONEON-NOSVE-NEXT: .LBB7_19: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB7_4 +; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB7_5 +; NONEON-NOSVE-NEXT: .LBB7_21: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB7_6 +; NONEON-NOSVE-NEXT: .LBB7_22: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #10 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB7_7 +; NONEON-NOSVE-NEXT: .LBB7_23: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB7_8 +; NONEON-NOSVE-NEXT: .LBB7_24: // %cond.load19 +; NONEON-NOSVE-NEXT: add x9, x0, #14 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB7_9 
+; NONEON-NOSVE-NEXT: .LBB7_25: // %cond.load22 +; NONEON-NOSVE-NEXT: add x9, x0, #16 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[0], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB7_10 +; NONEON-NOSVE-NEXT: .LBB7_26: // %cond.load25 +; NONEON-NOSVE-NEXT: add x9, x0, #18 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB7_11 +; NONEON-NOSVE-NEXT: .LBB7_27: // %cond.load28 +; NONEON-NOSVE-NEXT: add x9, x0, #20 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB7_12 +; NONEON-NOSVE-NEXT: .LBB7_28: // %cond.load31 +; NONEON-NOSVE-NEXT: add x9, x0, #22 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB7_13 +; NONEON-NOSVE-NEXT: .LBB7_29: // %cond.load34 +; NONEON-NOSVE-NEXT: add x9, x0, #24 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB7_14 +; NONEON-NOSVE-NEXT: .LBB7_30: // %cond.load37 +; NONEON-NOSVE-NEXT: add x9, x0, #26 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB7_15 +; NONEON-NOSVE-NEXT: .LBB7_31: // %cond.load40 +; NONEON-NOSVE-NEXT: add x9, x0, #28 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB7_16 +; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.load43 +; NONEON-NOSVE-NEXT: add x8, x0, #30 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[7], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer) ret <16 x half> %load } @@ -225,6 +936,31 @@ define <2 x float> @masked_load_v2f32(ptr %src, <2 x i1> %mask) { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, 
v1.8b +; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB8_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_4 +; NONEON-NOSVE-NEXT: .LBB8_2: // %else2 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB8_3: // %cond.load +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB8_2 +; NONEON-NOSVE-NEXT: .LBB8_4: // %cond.load1 +; NONEON-NOSVE-NEXT: add x8, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <2 x float> @llvm.masked.load.v2f32(ptr %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer) ret <2 x float> %load } @@ -241,6 +977,41 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI9_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h1, v0.4h +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB9_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_6 +; NONEON-NOSVE-NEXT: .LBB9_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB9_7 +; NONEON-NOSVE-NEXT: .LBB9_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB9_8 +; NONEON-NOSVE-NEXT: .LBB9_4: // %else8 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB9_5: // %cond.load +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB9_2 +; NONEON-NOSVE-NEXT: .LBB9_6: // 
%cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB9_3 +; NONEON-NOSVE-NEXT: .LBB9_7: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB9_4 +; NONEON-NOSVE-NEXT: .LBB9_8: // %cond.load7 +; NONEON-NOSVE-NEXT: add x8, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[3], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer) ret <4 x float> %load } @@ -290,6 +1061,66 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) { ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: addv b2, v0.8b +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s2 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB10_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB10_10 +; NONEON-NOSVE-NEXT: .LBB10_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB10_11 +; NONEON-NOSVE-NEXT: .LBB10_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB10_12 +; NONEON-NOSVE-NEXT: .LBB10_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB10_13 +; NONEON-NOSVE-NEXT: .LBB10_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB10_14 +; NONEON-NOSVE-NEXT: .LBB10_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB10_15 +; NONEON-NOSVE-NEXT: .LBB10_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB10_16 +; NONEON-NOSVE-NEXT: .LBB10_8: // %else20 +; NONEON-NOSVE-NEXT: ret +; 
NONEON-NOSVE-NEXT: .LBB10_9: // %cond.load +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_2 +; NONEON-NOSVE-NEXT: .LBB10_10: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB10_3 +; NONEON-NOSVE-NEXT: .LBB10_11: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB10_4 +; NONEON-NOSVE-NEXT: .LBB10_12: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB10_5 +; NONEON-NOSVE-NEXT: .LBB10_13: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #16 +; NONEON-NOSVE-NEXT: ld1 { v1.s }[0], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB10_6 +; NONEON-NOSVE-NEXT: .LBB10_14: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #20 +; NONEON-NOSVE-NEXT: ld1 { v1.s }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB10_7 +; NONEON-NOSVE-NEXT: .LBB10_15: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #24 +; NONEON-NOSVE-NEXT: ld1 { v1.s }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB10_8 +; NONEON-NOSVE-NEXT: .LBB10_16: // %cond.load19 +; NONEON-NOSVE-NEXT: add x8, x0, #28 +; NONEON-NOSVE-NEXT: ld1 { v1.s }[3], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <8 x float> @llvm.masked.load.v8f32(ptr %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer) ret <8 x float> %load } @@ -306,6 +1137,29 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI11_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s 
+; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB11_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_4 +; NONEON-NOSVE-NEXT: .LBB11_2: // %else2 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB11_3: // %cond.load +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB11_2 +; NONEON-NOSVE-NEXT: .LBB11_4: // %cond.load1 +; NONEON-NOSVE-NEXT: add x8, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.d }[1], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer) ret <2 x double> %load } @@ -331,6 +1185,42 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI12_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI12_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: addv h2, v0.4h +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s2 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB12_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB12_6 +; NONEON-NOSVE-NEXT: .LBB12_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB12_7 +; NONEON-NOSVE-NEXT: .LBB12_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB12_8 +; NONEON-NOSVE-NEXT: .LBB12_4: // %else8 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB12_5: // %cond.load +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB12_2 +; NONEON-NOSVE-NEXT: .LBB12_6: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.d 
}[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB12_3 +; NONEON-NOSVE-NEXT: .LBB12_7: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #16 +; NONEON-NOSVE-NEXT: ld1 { v1.d }[0], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB12_4 +; NONEON-NOSVE-NEXT: .LBB12_8: // %cond.load7 +; NONEON-NOSVE-NEXT: add x8, x0, #24 +; NONEON-NOSVE-NEXT: ld1 { v1.d }[1], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer) ret <4 x double> %load } @@ -356,6 +1246,38 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_zext_v3i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: and w8, w1, #0x1 +; NONEON-NOSVE-NEXT: bfi w8, w2, #1, #1 +; NONEON-NOSVE-NEXT: bfi w8, w3, #2, #1 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB13_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB13_3 +; NONEON-NOSVE-NEXT: b .LBB13_4 +; NONEON-NOSVE-NEXT: .LBB13_2: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB13_4 +; NONEON-NOSVE-NEXT: .LBB13_3: // %cond.load1 +; NONEON-NOSVE-NEXT: mov v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] +; NONEON-NOSVE-NEXT: mov v1.h[2], v0.h[2] +; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: .LBB13_4: // %else2 +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB13_6 +; NONEON-NOSVE-NEXT: // %bb.5: // %cond.load4 +; NONEON-NOSVE-NEXT: mov v0.h[1], v0.h[1] +; NONEON-NOSVE-NEXT: add x8, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: .LBB13_6: // %else5 +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %load_value = tail call 
<3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) %extend = zext <3 x i16> %load_value to <3 x i32> ret <3 x i32> %extend; @@ -382,6 +1304,38 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_sext_v3i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: and w8, w1, #0x1 +; NONEON-NOSVE-NEXT: bfi w8, w2, #1, #1 +; NONEON-NOSVE-NEXT: bfi w8, w3, #2, #1 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB14_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB14_3 +; NONEON-NOSVE-NEXT: b .LBB14_4 +; NONEON-NOSVE-NEXT: .LBB14_2: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB14_4 +; NONEON-NOSVE-NEXT: .LBB14_3: // %cond.load1 +; NONEON-NOSVE-NEXT: mov v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] +; NONEON-NOSVE-NEXT: mov v1.h[2], v0.h[2] +; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: .LBB14_4: // %else2 +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB14_6 +; NONEON-NOSVE-NEXT: // %bb.5: // %cond.load4 +; NONEON-NOSVE-NEXT: mov v0.h[1], v0.h[1] +; NONEON-NOSVE-NEXT: add x8, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: .LBB14_6: // %else5 +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) %extend = sext <3 x i16> %load_value to <3 x i32> ret <3 x i32> %extend; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index 
bd6b96889b4cc5..0904399558aee1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,37 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI0_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB0_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB0_6 +; NONEON-NOSVE-NEXT: .LBB0_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB0_7 +; NONEON-NOSVE-NEXT: .LBB0_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB0_8 +; NONEON-NOSVE-NEXT: .LBB0_4: // %else6 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB0_5: // %cond.store +; NONEON-NOSVE-NEXT: strb wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB0_2 +; NONEON-NOSVE-NEXT: .LBB0_6: // %cond.store1 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB0_3 +; NONEON-NOSVE-NEXT: .LBB0_7: // %cond.store3 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB0_4 +; NONEON-NOSVE-NEXT: .LBB0_8: // %cond.store5 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void } @@ 
-34,6 +66,57 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB1_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB1_10 +; NONEON-NOSVE-NEXT: .LBB1_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB1_11 +; NONEON-NOSVE-NEXT: .LBB1_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB1_12 +; NONEON-NOSVE-NEXT: .LBB1_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB1_13 +; NONEON-NOSVE-NEXT: .LBB1_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB1_14 +; NONEON-NOSVE-NEXT: .LBB1_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB1_15 +; NONEON-NOSVE-NEXT: .LBB1_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_16 +; NONEON-NOSVE-NEXT: .LBB1_8: // %else14 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB1_9: // %cond.store +; NONEON-NOSVE-NEXT: strb wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB1_2 +; NONEON-NOSVE-NEXT: .LBB1_10: // %cond.store1 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB1_3 +; NONEON-NOSVE-NEXT: .LBB1_11: // %cond.store3 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB1_4 +; NONEON-NOSVE-NEXT: .LBB1_12: // %cond.store5 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB1_5 +; NONEON-NOSVE-NEXT: .LBB1_13: // %cond.store7 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB1_6 +; NONEON-NOSVE-NEXT: .LBB1_14: // %cond.store9 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #5] +; 
NONEON-NOSVE-NEXT: tbz w8, #6, .LBB1_7 +; NONEON-NOSVE-NEXT: .LBB1_15: // %cond.store11 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_8 +; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.store13 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void } @@ -49,6 +132,99 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18 +; NONEON-NOSVE-NEXT: .LBB2_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB2_19 +; NONEON-NOSVE-NEXT: .LBB2_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB2_20 +; NONEON-NOSVE-NEXT: .LBB2_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB2_21 +; NONEON-NOSVE-NEXT: .LBB2_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB2_22 +; NONEON-NOSVE-NEXT: .LBB2_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB2_23 +; NONEON-NOSVE-NEXT: .LBB2_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB2_24 +; NONEON-NOSVE-NEXT: .LBB2_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB2_25 +; NONEON-NOSVE-NEXT: .LBB2_9: // %else16 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB2_26 +; NONEON-NOSVE-NEXT: .LBB2_10: // %else18 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB2_27 +; NONEON-NOSVE-NEXT: .LBB2_11: // %else20 +; 
NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB2_28 +; NONEON-NOSVE-NEXT: .LBB2_12: // %else22 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB2_29 +; NONEON-NOSVE-NEXT: .LBB2_13: // %else24 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB2_30 +; NONEON-NOSVE-NEXT: .LBB2_14: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB2_31 +; NONEON-NOSVE-NEXT: .LBB2_15: // %else28 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32 +; NONEON-NOSVE-NEXT: .LBB2_16: // %else30 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.store +; NONEON-NOSVE-NEXT: strb wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB2_2 +; NONEON-NOSVE-NEXT: .LBB2_18: // %cond.store1 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB2_3 +; NONEON-NOSVE-NEXT: .LBB2_19: // %cond.store3 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB2_4 +; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.store5 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB2_5 +; NONEON-NOSVE-NEXT: .LBB2_21: // %cond.store7 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB2_6 +; NONEON-NOSVE-NEXT: .LBB2_22: // %cond.store9 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #5] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB2_7 +; NONEON-NOSVE-NEXT: .LBB2_23: // %cond.store11 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB2_8 +; NONEON-NOSVE-NEXT: .LBB2_24: // %cond.store13 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB2_9 +; NONEON-NOSVE-NEXT: .LBB2_25: // %cond.store15 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB2_10 +; NONEON-NOSVE-NEXT: .LBB2_26: // %cond.store17 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #9] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB2_11 +; NONEON-NOSVE-NEXT: .LBB2_27: // %cond.store19 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #10] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB2_12 +; NONEON-NOSVE-NEXT: .LBB2_28: // %cond.store21 +; NONEON-NOSVE-NEXT: strb wzr, [x0, 
#11] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB2_13 +; NONEON-NOSVE-NEXT: .LBB2_29: // %cond.store23 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB2_14 +; NONEON-NOSVE-NEXT: .LBB2_30: // %cond.store25 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #13] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB2_15 +; NONEON-NOSVE-NEXT: .LBB2_31: // %cond.store27 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #14] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16 +; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.store29 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #15] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void } @@ -129,6 +305,244 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: fmov s1, w1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] +; NONEON-NOSVE-NEXT: mov v1.b[1], w2 +; NONEON-NOSVE-NEXT: mov v0.b[1], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: mov v1.b[2], w3 +; NONEON-NOSVE-NEXT: mov v0.b[2], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: mov v1.b[3], w4 +; NONEON-NOSVE-NEXT: mov v0.b[3], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] +; NONEON-NOSVE-NEXT: mov v1.b[4], w5 +; NONEON-NOSVE-NEXT: mov v0.b[4], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: mov v1.b[5], w6 +; NONEON-NOSVE-NEXT: mov v0.b[5], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] +; NONEON-NOSVE-NEXT: mov v1.b[6], w7 +; NONEON-NOSVE-NEXT: mov v0.b[6], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: mov v1.b[7], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: mov v0.b[7], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] +; NONEON-NOSVE-NEXT: mov v1.b[8], w9 +; 
NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: mov v0.b[8], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: mov v1.b[9], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: mov v0.b[9], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] +; NONEON-NOSVE-NEXT: mov v1.b[10], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: mov v0.b[10], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: mov v1.b[11], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: mov v0.b[11], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] +; NONEON-NOSVE-NEXT: mov v1.b[12], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: mov v0.b[12], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: mov v1.b[13], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] +; NONEON-NOSVE-NEXT: mov v0.b[13], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] +; NONEON-NOSVE-NEXT: mov v1.b[14], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #64] +; NONEON-NOSVE-NEXT: mov v0.b[14], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: mov v1.b[15], w9 +; NONEON-NOSVE-NEXT: mov v0.b[15], w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] +; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: addv h1, v1.8h +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: bfi w8, w9, #16, #16 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; 
NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34 +; NONEON-NOSVE-NEXT: .LBB3_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35 +; NONEON-NOSVE-NEXT: .LBB3_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36 +; NONEON-NOSVE-NEXT: .LBB3_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37 +; NONEON-NOSVE-NEXT: .LBB3_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38 +; NONEON-NOSVE-NEXT: .LBB3_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39 +; NONEON-NOSVE-NEXT: .LBB3_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40 +; NONEON-NOSVE-NEXT: .LBB3_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41 +; NONEON-NOSVE-NEXT: .LBB3_9: // %else16 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42 +; NONEON-NOSVE-NEXT: .LBB3_10: // %else18 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43 +; NONEON-NOSVE-NEXT: .LBB3_11: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44 +; NONEON-NOSVE-NEXT: .LBB3_12: // %else22 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45 +; NONEON-NOSVE-NEXT: .LBB3_13: // %else24 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46 +; NONEON-NOSVE-NEXT: .LBB3_14: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47 +; NONEON-NOSVE-NEXT: .LBB3_15: // %else28 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48 +; NONEON-NOSVE-NEXT: .LBB3_16: // %else30 +; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49 +; NONEON-NOSVE-NEXT: .LBB3_17: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50 +; NONEON-NOSVE-NEXT: .LBB3_18: // %else34 +; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51 +; NONEON-NOSVE-NEXT: .LBB3_19: // %else36 +; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52 +; NONEON-NOSVE-NEXT: .LBB3_20: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53 +; NONEON-NOSVE-NEXT: .LBB3_21: // %else40 +; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54 +; NONEON-NOSVE-NEXT: .LBB3_22: // %else42 +; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_55 +; NONEON-NOSVE-NEXT: .LBB3_23: // %else44 +; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56 +; NONEON-NOSVE-NEXT: .LBB3_24: // %else46 
+; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57 +; NONEON-NOSVE-NEXT: .LBB3_25: // %else48 +; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58 +; NONEON-NOSVE-NEXT: .LBB3_26: // %else50 +; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_59 +; NONEON-NOSVE-NEXT: .LBB3_27: // %else52 +; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60 +; NONEON-NOSVE-NEXT: .LBB3_28: // %else54 +; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61 +; NONEON-NOSVE-NEXT: .LBB3_29: // %else56 +; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62 +; NONEON-NOSVE-NEXT: .LBB3_30: // %else58 +; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63 +; NONEON-NOSVE-NEXT: .LBB3_31: // %else60 +; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64 +; NONEON-NOSVE-NEXT: .LBB3_32: // %else62 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.store +; NONEON-NOSVE-NEXT: strb wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2 +; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store1 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3 +; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store3 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4 +; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store5 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5 +; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store7 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6 +; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store9 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #5] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7 +; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.store11 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8 +; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store13 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9 +; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store15 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10 +; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store17 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #9] 
+; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11 +; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store19 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #10] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12 +; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store21 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #11] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13 +; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store23 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14 +; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store25 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #13] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15 +; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store27 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #14] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16 +; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store29 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #15] +; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17 +; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store31 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #16] +; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18 +; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.store33 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #17] +; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19 +; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store35 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #18] +; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20 +; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store37 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #19] +; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21 +; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store39 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #20] +; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22 +; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store41 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #21] +; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23 +; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store43 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #22] +; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24 +; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store45 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #23] +; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25 +; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store47 +; 
NONEON-NOSVE-NEXT: strb wzr, [x0, #24] +; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26 +; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store49 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #25] +; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27 +; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store51 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #26] +; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28 +; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store53 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #27] +; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29 +; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.store55 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #28] +; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30 +; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store57 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #29] +; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31 +; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store59 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #30] +; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32 +; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store61 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #31] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask) ret void } @@ -154,6 +568,29 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB4_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_4 +; NONEON-NOSVE-NEXT: .LBB4_2: // %else2 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.store +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0] +; NONEON-NOSVE-NEXT: tbz 
w8, #1, .LBB4_2 +; NONEON-NOSVE-NEXT: .LBB4_4: // %cond.store1 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v2f16(<2 x half> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void } @@ -169,6 +606,41 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB5_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_6 +; NONEON-NOSVE-NEXT: .LBB5_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB5_7 +; NONEON-NOSVE-NEXT: .LBB5_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB5_8 +; NONEON-NOSVE-NEXT: .LBB5_4: // %else6 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB5_5: // %cond.store +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB5_2 +; NONEON-NOSVE-NEXT: .LBB5_6: // %cond.store1 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB5_3 +; NONEON-NOSVE-NEXT: .LBB5_7: // %cond.store3 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB5_4 +; NONEON-NOSVE-NEXT: .LBB5_8: // %cond.store5 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #6] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void } @@ -185,6 +657,65 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; 
CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB6_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_10 +; NONEON-NOSVE-NEXT: .LBB6_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB6_11 +; NONEON-NOSVE-NEXT: .LBB6_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB6_12 +; NONEON-NOSVE-NEXT: .LBB6_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB6_13 +; NONEON-NOSVE-NEXT: .LBB6_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB6_14 +; NONEON-NOSVE-NEXT: .LBB6_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB6_15 +; NONEON-NOSVE-NEXT: .LBB6_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_16 +; NONEON-NOSVE-NEXT: .LBB6_8: // %else14 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB6_9: // %cond.store +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB6_2 +; NONEON-NOSVE-NEXT: .LBB6_10: // %cond.store1 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB6_3 +; NONEON-NOSVE-NEXT: .LBB6_11: // %cond.store3 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB6_4 +; NONEON-NOSVE-NEXT: .LBB6_12: // %cond.store5 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB6_5 +; NONEON-NOSVE-NEXT: .LBB6_13: // %cond.store7 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB6_6 +; 
NONEON-NOSVE-NEXT: .LBB6_14: // %cond.store9 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #10] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB6_7 +; NONEON-NOSVE-NEXT: .LBB6_15: // %cond.store11 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB6_8 +; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.store13 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #14] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void } @@ -209,6 +740,115 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; CHECK-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1] ; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18 +; NONEON-NOSVE-NEXT: .LBB7_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB7_19 +; NONEON-NOSVE-NEXT: .LBB7_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB7_20 +; NONEON-NOSVE-NEXT: .LBB7_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB7_21 +; NONEON-NOSVE-NEXT: .LBB7_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB7_22 +; NONEON-NOSVE-NEXT: .LBB7_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB7_23 +; NONEON-NOSVE-NEXT: .LBB7_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB7_24 +; NONEON-NOSVE-NEXT: .LBB7_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB7_25 +; 
NONEON-NOSVE-NEXT: .LBB7_9: // %else16 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB7_26 +; NONEON-NOSVE-NEXT: .LBB7_10: // %else18 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB7_27 +; NONEON-NOSVE-NEXT: .LBB7_11: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB7_28 +; NONEON-NOSVE-NEXT: .LBB7_12: // %else22 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB7_29 +; NONEON-NOSVE-NEXT: .LBB7_13: // %else24 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB7_30 +; NONEON-NOSVE-NEXT: .LBB7_14: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB7_31 +; NONEON-NOSVE-NEXT: .LBB7_15: // %else28 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32 +; NONEON-NOSVE-NEXT: .LBB7_16: // %else30 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.store +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB7_2 +; NONEON-NOSVE-NEXT: .LBB7_18: // %cond.store1 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB7_3 +; NONEON-NOSVE-NEXT: .LBB7_19: // %cond.store3 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB7_4 +; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.store5 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB7_5 +; NONEON-NOSVE-NEXT: .LBB7_21: // %cond.store7 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB7_6 +; NONEON-NOSVE-NEXT: .LBB7_22: // %cond.store9 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #10] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB7_7 +; NONEON-NOSVE-NEXT: .LBB7_23: // %cond.store11 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB7_8 +; NONEON-NOSVE-NEXT: .LBB7_24: // %cond.store13 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #14] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB7_9 +; 
NONEON-NOSVE-NEXT: .LBB7_25: // %cond.store15 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #16] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB7_10 +; NONEON-NOSVE-NEXT: .LBB7_26: // %cond.store17 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #18] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB7_11 +; NONEON-NOSVE-NEXT: .LBB7_27: // %cond.store19 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #20] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB7_12 +; NONEON-NOSVE-NEXT: .LBB7_28: // %cond.store21 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #22] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB7_13 +; NONEON-NOSVE-NEXT: .LBB7_29: // %cond.store23 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #24] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB7_14 +; NONEON-NOSVE-NEXT: .LBB7_30: // %cond.store25 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #26] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB7_15 +; NONEON-NOSVE-NEXT: .LBB7_31: // %cond.store27 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #28] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB7_16 +; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.store29 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #30] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void } @@ -225,6 +865,37 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz 
w8, #0, .LBB8_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_6 +; NONEON-NOSVE-NEXT: .LBB8_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB8_7 +; NONEON-NOSVE-NEXT: .LBB8_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB8_8 +; NONEON-NOSVE-NEXT: .LBB8_4: // %else6 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB8_5: // %cond.store +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB8_2 +; NONEON-NOSVE-NEXT: .LBB8_6: // %cond.store1 +; NONEON-NOSVE-NEXT: str wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB8_3 +; NONEON-NOSVE-NEXT: .LBB8_7: // %cond.store3 +; NONEON-NOSVE-NEXT: str wzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB8_4 +; NONEON-NOSVE-NEXT: .LBB8_8: // %cond.store5 +; NONEON-NOSVE-NEXT: str wzr, [x0, #12] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void } @@ -275,6 +946,57 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI9_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB9_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_10 +; NONEON-NOSVE-NEXT: .LBB9_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB9_11 +; NONEON-NOSVE-NEXT: .LBB9_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB9_12 +; NONEON-NOSVE-NEXT: .LBB9_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB9_13 +; NONEON-NOSVE-NEXT: .LBB9_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB9_14 +; NONEON-NOSVE-NEXT: .LBB9_6: // %else10 +; 
NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB9_15 +; NONEON-NOSVE-NEXT: .LBB9_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB9_16 +; NONEON-NOSVE-NEXT: .LBB9_8: // %else14 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB9_9: // %cond.store +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB9_2 +; NONEON-NOSVE-NEXT: .LBB9_10: // %cond.store1 +; NONEON-NOSVE-NEXT: str wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB9_3 +; NONEON-NOSVE-NEXT: .LBB9_11: // %cond.store3 +; NONEON-NOSVE-NEXT: str wzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB9_4 +; NONEON-NOSVE-NEXT: .LBB9_12: // %cond.store5 +; NONEON-NOSVE-NEXT: str wzr, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB9_5 +; NONEON-NOSVE-NEXT: .LBB9_13: // %cond.store7 +; NONEON-NOSVE-NEXT: str wzr, [x0, #16] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB9_6 +; NONEON-NOSVE-NEXT: .LBB9_14: // %cond.store9 +; NONEON-NOSVE-NEXT: str wzr, [x0, #20] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB9_7 +; NONEON-NOSVE-NEXT: .LBB9_15: // %cond.store11 +; NONEON-NOSVE-NEXT: str wzr, [x0, #24] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB9_8 +; NONEON-NOSVE-NEXT: .LBB9_16: // %cond.store13 +; NONEON-NOSVE-NEXT: str wzr, [x0, #28] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void } @@ -291,6 +1013,27 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB10_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, 
#1, .LBB10_4 +; NONEON-NOSVE-NEXT: .LBB10_2: // %else2 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB10_3: // %cond.store +; NONEON-NOSVE-NEXT: str xzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_2 +; NONEON-NOSVE-NEXT: .LBB10_4: // %cond.store1 +; NONEON-NOSVE-NEXT: str xzr, [x0, #8] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void } @@ -315,6 +1058,37 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI11_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB11_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_6 +; NONEON-NOSVE-NEXT: .LBB11_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB11_7 +; NONEON-NOSVE-NEXT: .LBB11_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB11_8 +; NONEON-NOSVE-NEXT: .LBB11_4: // %else6 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB11_5: // %cond.store +; NONEON-NOSVE-NEXT: str xzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB11_2 +; NONEON-NOSVE-NEXT: .LBB11_6: // %cond.store1 +; NONEON-NOSVE-NEXT: str xzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB11_3 +; NONEON-NOSVE-NEXT: .LBB11_7: // %cond.store3 +; NONEON-NOSVE-NEXT: str xzr, [x0, #16] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB11_4 +; NONEON-NOSVE-NEXT: .LBB11_8: // %cond.store5 +; NONEON-NOSVE-NEXT: str xzr, [x0, #24] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void } diff --git 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll index aef446a90df656..6a6b47e815ac16 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -14,6 +15,15 @@ define void @add_v4i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ldr s1, [x1] +; NONEON-NOSVE-NEXT: uaddl v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i8>, ptr %a %op2 = load <4 x i8>, ptr %b %res = add <4 x i8> %op1, %op2 @@ -29,6 +39,14 @@ define void @add_v8i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: add v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b %res = add <8 x i8> %op1, %op2 @@ -44,6 +62,14 @@ define void @add_v16i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, 
[x1] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i8>, ptr %a %op2 = load <16 x i8>, ptr %b %res = add <16 x i8> %op1, %op2 @@ -60,6 +86,15 @@ define void @add_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.b, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = add <32 x i8> %op1, %op2 @@ -76,6 +111,23 @@ define void @add_v2i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: ldrh w9, [x1] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: fmov s1, w9 +; NONEON-NOSVE-NEXT: add x8, x0, #2 +; NONEON-NOSVE-NEXT: add x9, x1, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: ld1 { v1.h }[2], [x9] +; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: mov w8, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: strh w9, [x0] +; NONEON-NOSVE-NEXT: strh w8, [x0, #2] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i16>, ptr %a %op2 = load <2 x i16>, ptr %b %res = add <2 x i16> %op1, %op2 @@ -91,6 +143,14 @@ define void @add_v4i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, 
ptr %a %op2 = load <4 x i16>, ptr %b %res = add <4 x i16> %op1, %op2 @@ -106,6 +166,14 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b %res = add <8 x i16> %op1, %op2 @@ -122,6 +190,15 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: add z1.h, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = add <16 x i16> %op1, %op2 @@ -137,6 +214,13 @@ define void @abs_v2i32(ptr %a) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i32>, ptr %a %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) store <2 x i32> %res, ptr %a @@ -151,6 +235,13 @@ define void @abs_v4i32(ptr %a) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) store <4 x 
i32> %res, ptr %a @@ -166,6 +257,14 @@ define void @abs_v8i32(ptr %a) { ; CHECK-NEXT: abs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: abs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) store <8 x i32> %res, ptr %a @@ -180,6 +279,13 @@ define void @abs_v2i64(ptr %a) { ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) store <2 x i64> %res, ptr %a @@ -195,6 +301,14 @@ define void @abs_v4i64(ptr %a) { ; CHECK-NEXT: abs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: abs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) store <4 x i64> %res, ptr %a @@ -211,6 +325,17 @@ define void @fadd_v2f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ldr s1, [x1] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load 
<2 x half>, ptr %a %op2 = load <2 x half>, ptr %b %res = fadd <2 x half> %op1, %op2 @@ -227,6 +352,17 @@ define void @fadd_v4f16(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %op2 = load <4 x half>, ptr %b %res = fadd <4 x half> %op1, %op2 @@ -243,6 +379,21 @@ define void @fadd_v8f16(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %op2 = load <8 x half>, ptr %b %res = fadd <8 x half> %op1, %op2 @@ -261,6 +412,29 @@ define void @fadd_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; 
NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fadd <16 x half> %op1, %op2 @@ -277,6 +451,14 @@ define void @fadd_v2f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: fadd v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %op2 = load <2 x float>, ptr %b %res = fadd <2 x float> %op1, %op2 @@ -293,6 +475,14 @@ define void @fadd_v4f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %op2 = load <4 x float>, ptr %b %res = fadd <4 x float> %op1, %op2 @@ -311,6 +501,15 @@ define void @fadd_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd 
v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fadd <8 x float> %op1, %op2 @@ -327,6 +526,14 @@ define void @fadd_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fadd v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %op2 = load <2 x double>, ptr %b %res = fadd <2 x double> %op1, %op2 @@ -345,6 +552,15 @@ define void @fadd_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = fadd <4 x double> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll index 6d91253caae58f..03bb899c517b4e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -15,6 +16,14 @@ define void 
@test_revbv16i16(ptr %a) { ; CHECK-NEXT: revb z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revbv16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> store <32 x i8> %tmp2, ptr %a @@ -31,6 +40,14 @@ define void @test_revbv8i32(ptr %a) { ; CHECK-NEXT: revb z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revbv8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> store <32 x i8> %tmp2, ptr %a @@ -47,6 +64,14 @@ define void @test_revbv4i64(ptr %a) { ; CHECK-NEXT: revb z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revbv4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> store <32 x i8> %tmp2, ptr %a @@ -63,6 +88,14 @@ define void @test_revhv8i32(ptr %a) { ; CHECK-NEXT: revh z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revhv8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.8h, v0.8h +; NONEON-NOSVE-NEXT: rev32 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = 
shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> store <16 x i16> %tmp2, ptr %a @@ -79,6 +112,14 @@ define void @test_revhv8f32(ptr %a) { ; CHECK-NEXT: revh z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revhv8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.8h, v0.8h +; NONEON-NOSVE-NEXT: rev32 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x half>, ptr %a %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> store <16 x half> %tmp2, ptr %a @@ -95,6 +136,14 @@ define void @test_revhv4i64(ptr %a) { ; CHECK-NEXT: revh z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revhv4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.8h, v0.8h +; NONEON-NOSVE-NEXT: rev64 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> store <16 x i16> %tmp2, ptr %a @@ -111,6 +160,14 @@ define void @test_revwv4i64(ptr %a) { ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revwv4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s +; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> store <8 x i32> %tmp2, ptr %a @@ -127,6 +184,14 @@ define void @test_revwv4f64(ptr %a) { ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revwv4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s +; NONEON-NOSVE-NEXT: 
rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> store <8 x float> %tmp2, ptr %a @@ -141,6 +206,12 @@ define <16 x i8> @test_revv16i8(ptr %a) { ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revv16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i8>, ptr %a %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> ret <16 x i8> %tmp2 @@ -156,6 +227,14 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revwv8i32v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s +; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> @@ -176,6 +255,18 @@ define void @test_revhv32i16(ptr %a) { ; CHECK-NEXT: stp q0, q1, [x0, #32] ; CHECK-NEXT: stp q2, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revhv32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.8h, v0.8h +; NONEON-NOSVE-NEXT: rev64 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: rev64 v2.8h, v2.8h +; NONEON-NOSVE-NEXT: rev64 v3.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i16>, ptr %a %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> store <32 x i16> %tmp2, ptr %a @@ -191,6 +282,14 @@ define void @test_rev_elts_fail(ptr %a) { ; 
CHECK-NEXT: tbl z0.d, { z2.d }, z0.d ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_rev_elts_fail: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> store <4 x i64> %tmp2, ptr %a @@ -208,6 +307,15 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 { ; CHECK-NEXT: revd z1.q, p0/m, z1.q ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revdv4i64_sve2p1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ptrue p0.d, vl2 +; NONEON-NOSVE-NEXT: revd z0.q, p0/m, z0.q +; NONEON-NOSVE-NEXT: revd z1.q, p0/m, z1.q +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> store <4 x i64> %tmp2, ptr %a @@ -223,6 +331,15 @@ define void @test_revdv4f64_sve2p1(ptr %a) #1 { ; CHECK-NEXT: revd z1.q, p0/m, z1.q ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revdv4f64_sve2p1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ptrue p0.d +; NONEON-NOSVE-NEXT: revd z0.q, p0/m, z0.q +; NONEON-NOSVE-NEXT: revd z1.q, p0/m, z1.q +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> store <4 x double> %tmp2, ptr %a @@ -238,6 +355,16 @@ define void @test_revv8i32(ptr %a) { ; CHECK-NEXT: tbl z0.s, { z2.s }, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revv8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s +; NONEON-NOSVE-NEXT: rev64 
v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> store <8 x i32> %tmp2, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index 8808ad9a23d7c5..f254a1f9098f2d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -68,6 +69,18 @@ define void @zip1_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip1_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: zip2 v2.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <32 x i8>, ptr %a %tmp2 = load volatile <32 x i8>, ptr %b %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> @@ -196,6 +209,28 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip_v32i16: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: ldp q4, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q5, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q6, q2, [x1, #32] +; NONEON-NOSVE-NEXT: ldp q7, q3, [x1] +; NONEON-NOSVE-NEXT: zip1 v17.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: zip2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: zip1 v16.8h, v1.8h, v3.8h +; NONEON-NOSVE-NEXT: zip2 v1.8h, v1.8h, v3.8h +; NONEON-NOSVE-NEXT: zip1 v2.8h, v5.8h, v7.8h +; NONEON-NOSVE-NEXT: zip1 v3.8h, v4.8h, v6.8h +; NONEON-NOSVE-NEXT: zip2 v5.8h, v5.8h, v7.8h +; NONEON-NOSVE-NEXT: zip2 v4.8h, v4.8h, v6.8h +; NONEON-NOSVE-NEXT: add v6.8h, v16.8h, v17.8h +; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: add v2.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: stp q6, q0, [x0, #32] +; NONEON-NOSVE-NEXT: stp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i16>, ptr %a %tmp2 = load <32 x i16>, ptr %b %tmp3 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32> @@ -244,6 +279,18 @@ define void @zip1_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip1_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: zip2 v2.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: zip1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <16 x i16>, ptr %a %tmp2 = load volatile <16 x i16>, ptr %b %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> @@ -276,6 +323,18 @@ define void @zip1_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip1_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] 
+; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = load volatile <8 x i32>, ptr %b %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> @@ -298,6 +357,19 @@ define void @zip_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v3.2d +; NONEON-NOSVE-NEXT: zip1 v5.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: zip2 v1.2d, v1.2d, v3.2d +; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: fadd v2.2d, v4.2d, v5.2d +; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> @@ -330,6 +402,16 @@ define void @zip_v4i32(ptr %a, ptr %b) { ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: zip1 v2.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: zip2 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i32>, ptr %a %tmp2 = load <4 x i32>, ptr %b %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> @@ -351,6 +433,16 @@ define void @zip1_v8i32_undef(ptr %a) { ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip1_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, 
[x0, #16] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: zip2 v1.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> store volatile <8 x i32> %tmp2, ptr %a @@ -370,6 +462,19 @@ define void @trn_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.b, z1.b, z2.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: trn1 v4.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: trn2 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: trn1 v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: trn2 v2.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: add v0.16b, v4.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = load <32 x i8>, ptr %b %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> @@ -392,6 +497,19 @@ define void @trn_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI8_1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: ldr q2, [x9, :lo12:.LCPI8_1] +; NONEON-NOSVE-NEXT: tbl v0.16b, { v1.16b }, v0.16b +; NONEON-NOSVE-NEXT: tbl v1.16b, { v1.16b }, v2.16b +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i16>, ptr %a %tmp2 = load <8 x i16>, ptr %b %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> @@ -414,6 +532,19 @@ define void @trn_v16i16(ptr %a, ptr %b) { ; 
CHECK-NEXT: add z1.h, z1.h, z2.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: trn1 v4.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: trn2 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: trn1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: trn2 v2.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: add v0.8h, v4.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = load <16 x i16>, ptr %b %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> @@ -436,6 +567,19 @@ define void @trn_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.s, z1.s, z2.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: trn1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: trn2 v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: add v0.4s, v4.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> @@ -459,6 +603,19 @@ define void @trn_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: zip1 v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: zip2 v2.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: fadd v0.2d, v4.2d, v0.2d +; 
NONEON-NOSVE-NEXT: fadd v1.2d, v1.2d, v2.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> @@ -479,6 +636,16 @@ define void @trn_v4f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: trn1 v2.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x float>, ptr %a %tmp2 = load <4 x float>, ptr %b %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> @@ -500,6 +667,18 @@ define void @trn_v8i32_undef(ptr %a) { ; CHECK-NEXT: add z1.s, z3.s, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: trn1 v2.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: trn1 v3.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: trn2 v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -571,6 +750,18 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{ ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip2_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; 
NONEON-NOSVE-NEXT: zip2 v2.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <32 x i8>, ptr %a %tmp2 = load volatile <32 x i8>, ptr %b %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> @@ -617,6 +808,18 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip2_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: zip2 v2.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: zip1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <16 x i16>, ptr %a %tmp2 = load volatile <16 x i16>, ptr %b %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> @@ -649,6 +852,18 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{ ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip2_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = load volatile <8 x i32>, ptr %b %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> @@ -668,6 +883,16 @@ define void @zip2_v8i32_undef(ptr %a) #0{ ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip2_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] 
+; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: zip2 v1.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> store volatile <8 x i32> %tmp2, ptr %a @@ -869,6 +1094,19 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{ ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: uzp1 v4.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: uzp2 v2.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: add v0.16b, v4.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = load <32 x i8>, ptr %b %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> @@ -891,6 +1129,17 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ext v1.8b, v0.8b, v0.8b, #6 +; NONEON-NOSVE-NEXT: ext v2.8b, v0.8b, v0.8b, #2 +; NONEON-NOSVE-NEXT: trn1 v1.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: zip1 v0.4h, v2.4h, v0.4h +; NONEON-NOSVE-NEXT: add v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i16>, ptr %a %tmp2 = load <4 x i16>, ptr %b %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> @@ -1008,6 +1257,19 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: uzp_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: uzp1 v4.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp2 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: add v0.8h, v4.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = load <16 x i16>, ptr %b %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> @@ -1047,6 +1309,19 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{ ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp2 v2.4s, v3.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v4.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = load <8 x float>, ptr %b %tmp3 = shufflevector <8 x float> %tmp1, <8 x float> %tmp2, <8 x i32> @@ -1069,6 +1344,19 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{ ; CHECK-NEXT: add z1.d, z1.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: zip2 v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: zip1 v1.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: zip2 v2.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: add v0.2d, v4.2d, v0.2d +; NONEON-NOSVE-NEXT: add v1.2d, v1.2d, v2.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = load <4 x i64>, ptr %b %tmp3 = shufflevector <4 x i64> %tmp1, <4 x i64> %tmp2, <4 x i32> @@ -1136,6 +1424,16 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: add v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i16>, ptr %a %tmp2 = load <8 x i16>, ptr %b %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> @@ -1174,6 +1472,15 @@ define void @uzp_v8i32_undef(ptr %a) #0{ ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -1197,6 +1504,19 @@ define void @zip_vscale2_4(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip_vscale2_4: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v3.2d +; NONEON-NOSVE-NEXT: zip1 v5.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: zip2 v1.2d, v1.2d, v3.2d +; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: fadd v2.2d, v4.2d, v5.2d +; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q2, q0, 
[x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll index 8039bd096bcb89..41d2cb8a2c7564 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -35,6 +36,23 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ptest_v16i1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: umaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 %v2 = fcmp une <16 x float> %v1, zeroinitializer @@ -92,6 +110,33 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ptest_or_v16i1: +; NONEON-NOSVE: // %bb.0: 
+; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x1, #32] +; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NONEON-NOSVE-NEXT: ldp q6, q7, [x1] +; NONEON-NOSVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: fcmeq v7.4s, v7.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: orn v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: umaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 %v2 = fcmp une <16 x float> %v1, zeroinitializer @@ -159,6 +204,33 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ptest_and_v16i1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x1, #32] +; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NONEON-NOSVE-NEXT: ldp q6, q7, [x1] +; NONEON-NOSVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: fcmeq v7.4s, v7.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NONEON-NOSVE-NEXT: 
uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: uminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 %v2 = fcmp une <16 x float> %v1, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll index 726fd28c90ae22..5626f77c684f22 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,13 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) { ; CHECK-NEXT: lsr z0.h, z0.h, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %op) ret <4 x i8> %res } @@ -30,6 +38,11 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) { ; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v8i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op) ret <8 x i8> %res } @@ -42,6 +55,11 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) { ; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op) ret <16 x i8> %res } @@ -55,6 +73,14 @@ define void @bitreverse_v32i8(ptr %a) { ; CHECK-NEXT: rbit z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op) store <32 x i8> %res, ptr %a @@ -70,6 +96,13 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) { ; CHECK-NEXT: lsr z0.s, z0.s, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -82,6 +115,12 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) { ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ -94,6 +133,12 @@ 
define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) { ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -107,6 +152,16 @@ define void @bitreverse_v16i16(ptr %a) { ; CHECK-NEXT: rbit z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -121,6 +176,12 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) { ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -133,6 +194,12 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) { ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -146,6 +213,16 @@ define void @bitreverse_v8i32(ptr %a) { ; CHECK-NEXT: rbit 
z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -160,6 +237,12 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) { ; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -172,6 +255,12 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) { ; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -185,6 +274,16 @@ define void @bitreverse_v4i64(ptr %a) { ; CHECK-NEXT: rbit z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op) store <4 
x i64> %res, ptr %a @@ -204,6 +303,12 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) { ; CHECK-NEXT: lsr z0.s, z0.s, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -216,6 +321,11 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) { ; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ -228,6 +338,11 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) { ; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -241,6 +356,14 @@ define void @bswap_v16i16(ptr %a) { ; CHECK-NEXT: revb z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -255,6 +378,11 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) { ; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -267,6 +395,11 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) { ; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -280,6 +413,14 @@ define void @bswap_v8i32(ptr %a) { ; CHECK-NEXT: revb z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -294,6 +435,11 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) { ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -306,6 +452,11 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) { ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -319,6 +470,14 @@ define void @bswap_v4i64(ptr %a) { ; CHECK-NEXT: revb z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v4i64: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op) store <4 x i64> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll index c022bf85e67e93..55f4f5bae641e5 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -14,6 +15,19 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) { ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v1.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: ushr v1.4h, v1.4h, #7 +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: usra v0.4h, v1.4h, #3 +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i8> %op1, shufflevector (<4 x i8> insertelement (<4 x i8> poison, i8 32, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer) ret <4 x i8> %res } @@ -26,6 +40,13 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) { ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 
killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: usra v0.8b, v1.8b, #3 +; NONEON-NOSVE-NEXT: sshr v0.8b, v0.8b, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer) ret <8 x i8> %res } @@ -38,6 +59,13 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) { ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: usra v0.16b, v1.16b, #3 +; NONEON-NOSVE-NEXT: sshr v0.16b, v0.16b, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer) ret <16 x i8> %res } @@ -51,6 +79,18 @@ define void @sdiv_v32i8(ptr %a) { ; CHECK-NEXT: asrd z1.b, p0/m, z1.b, #5 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v2.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: cmlt v3.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: usra v0.16b, v2.16b, #3 +; NONEON-NOSVE-NEXT: usra v1.16b, v3.16b, #3 +; NONEON-NOSVE-NEXT: sshr v0.16b, v0.16b, #5 +; NONEON-NOSVE-NEXT: sshr v1.16b, v1.16b, #5 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer) store <32 x i8> %res, ptr %a @@ -66,6 +106,20 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) { ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i16: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v1.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: ushr v1.2s, v1.2s, #26 +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i16> %op1, shufflevector (<2 x i16> insertelement (<2 x i16> poison, i16 32, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer) ret <2 x i16> %res } @@ -78,6 +132,13 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) { ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: usra v0.4h, v1.4h, #11 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer) ret <4 x i16> %res } @@ -90,6 +151,13 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) { ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.8h, v0.8h, #0 +; NONEON-NOSVE-NEXT: usra v0.8h, v1.8h, #11 +; NONEON-NOSVE-NEXT: sshr v0.8h, v0.8h, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer) ret <8 x i16> %res } @@ -103,6 +171,18 @@ define void @sdiv_v16i16(ptr %a) { ; CHECK-NEXT: asrd z1.h, p0/m, z1.h, #5 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v16i16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v2.8h, v0.8h, #0 +; NONEON-NOSVE-NEXT: cmlt v3.8h, v1.8h, #0 +; NONEON-NOSVE-NEXT: usra v0.8h, v2.8h, #11 +; NONEON-NOSVE-NEXT: usra v1.8h, v3.8h, #11 +; NONEON-NOSVE-NEXT: sshr v0.8h, v0.8h, #5 +; NONEON-NOSVE-NEXT: sshr v1.8h, v1.8h, #5 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer) store <16 x i16> %res, ptr %a @@ -117,6 +197,13 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) { ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: usra v0.2s, v1.2s, #27 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer) ret <2 x i32> %res } @@ -129,6 +216,13 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) { ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.4s, v0.4s, #0 +; NONEON-NOSVE-NEXT: usra v0.4s, v1.4s, #27 +; NONEON-NOSVE-NEXT: sshr v0.4s, v0.4s, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer) ret <4 x i32> %res } @@ -142,6 +236,18 @@ define void @sdiv_v8i32(ptr %a) { ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #5 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: cmlt v2.4s, v0.4s, #0 +; NONEON-NOSVE-NEXT: cmlt v3.4s, v1.4s, #0 +; NONEON-NOSVE-NEXT: usra v0.4s, v2.4s, #27 +; NONEON-NOSVE-NEXT: usra v1.4s, v3.4s, #27 +; NONEON-NOSVE-NEXT: sshr v0.4s, v0.4s, #5 +; NONEON-NOSVE-NEXT: sshr v1.4s, v1.4s, #5 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer) store <8 x i32> %res, ptr %a @@ -156,6 +262,13 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) { ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt d1, d0, #0 +; NONEON-NOSVE-NEXT: usra d0, d1, #59 +; NONEON-NOSVE-NEXT: sshr d0, d0, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer) ret <1 x i64> %res } @@ -169,6 +282,13 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) { ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.2d, v0.2d, #0 +; NONEON-NOSVE-NEXT: usra v0.2d, v1.2d, #59 +; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer) ret <2 x i64> %res } @@ -182,6 +302,18 @@ define void @sdiv_v4i64(ptr %a) { ; CHECK-NEXT: asrd z1.d, p0/m, z1.d, #5 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v2.2d, v0.2d, #0 +; NONEON-NOSVE-NEXT: cmlt v3.2d, v1.2d, #0 +; 
NONEON-NOSVE-NEXT: usra v0.2d, v2.2d, #59 +; NONEON-NOSVE-NEXT: usra v1.2d, v3.2d, #59 +; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #5 +; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #5 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer) store <4 x i64> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll index 649b13fa8a1e35..e15529e1926ac7 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -15,6 +16,11 @@ define <4 x i8> @splat_v4i8(i8 %a) { ; CHECK-NEXT: mov z0.h, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.4h, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i8> undef, i8 %a, i64 0 %splat = shufflevector <4 x i8> %insert, <4 x i8> undef, <4 x i32> zeroinitializer ret <4 x i8> %splat @@ -26,6 +32,11 @@ define <8 x i8> @splat_v8i8(i8 %a) { ; CHECK-NEXT: mov z0.b, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.8b, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i8> undef, i8 %a, i64 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer ret <8 x i8> %splat @@ -37,6 +48,11 @@ define <16 x i8> @splat_v16i8(i8 %a) { ; 
CHECK-NEXT: mov z0.b, w0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.16b, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i8> undef, i8 %a, i64 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %splat @@ -48,6 +64,12 @@ define void @splat_v32i8(i8 %a, ptr %b) { ; CHECK-NEXT: mov z0.b, w0 ; CHECK-NEXT: stp q0, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.16b, w0 +; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer store <32 x i8> %splat, ptr %b @@ -60,6 +82,11 @@ define <2 x i16> @splat_v2i16(i16 %a) { ; CHECK-NEXT: mov z0.s, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i16> undef, i16 %a, i64 0 %splat = shufflevector <2 x i16> %insert, <2 x i16> undef, <2 x i32> zeroinitializer ret <2 x i16> %splat @@ -71,6 +98,11 @@ define <4 x i16> @splat_v4i16(i16 %a) { ; CHECK-NEXT: mov z0.h, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.4h, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i16> undef, i16 %a, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer ret <4 x i16> %splat @@ -82,6 +114,11 @@ define <8 x i16> @splat_v8i16(i16 %a) { ; CHECK-NEXT: mov z0.h, w0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
dup v0.8h, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i16> undef, i16 %a, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %splat @@ -93,6 +130,12 @@ define void @splat_v16i16(i16 %a, ptr %b) { ; CHECK-NEXT: mov z0.h, w0 ; CHECK-NEXT: stp q0, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.8h, w0 +; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer store <16 x i16> %splat, ptr %b @@ -105,6 +148,11 @@ define <2 x i32> @splat_v2i32(i32 %a) { ; CHECK-NEXT: mov z0.s, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i32> undef, i32 %a, i64 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer ret <2 x i32> %splat @@ -116,6 +164,11 @@ define <4 x i32> @splat_v4i32(i32 %a) { ; CHECK-NEXT: mov z0.s, w0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.4s, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i32> undef, i32 %a, i64 0 %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat @@ -127,6 +180,12 @@ define void @splat_v8i32(i32 %a, ptr %b) { ; CHECK-NEXT: mov z0.s, w0 ; CHECK-NEXT: stp q0, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.4s, w0 +; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> 
undef, <8 x i32> zeroinitializer store <8 x i32> %splat, ptr %b @@ -139,6 +198,11 @@ define <1 x i64> @splat_v1i64(i64 %a) { ; CHECK-NEXT: mov z0.d, x0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov d0, x0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x i64> undef, i64 %a, i64 0 %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer ret <1 x i64> %splat @@ -150,6 +214,11 @@ define <2 x i64> @splat_v2i64(i64 %a) { ; CHECK-NEXT: mov z0.d, x0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.2d, x0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i64> undef, i64 %a, i64 0 %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat @@ -161,6 +230,12 @@ define void @splat_v4i64(i64 %a, ptr %b) { ; CHECK-NEXT: mov z0.d, x0 ; CHECK-NEXT: stp q0, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.2d, x0 +; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer store <4 x i64> %splat, ptr %b @@ -178,6 +253,12 @@ define <2 x half> @splat_v2f16(half %a) { ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.4h, v0.h[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x half> undef, half %a, i64 0 %splat = shufflevector <2 x half> %insert, <2 x half> undef, <2 x i32> zeroinitializer ret <2 x half> %splat @@ -190,6 +271,12 @@ define <4 x half> 
@splat_v4f16(half %a) { ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.4h, v0.h[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x half> undef, half %a, i64 0 %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer ret <4 x half> %splat @@ -202,6 +289,12 @@ define <8 x half> @splat_v8f16(half %a) { ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.8h, v0.h[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x half> undef, half %a, i64 0 %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer ret <8 x half> %splat @@ -214,6 +307,13 @@ define void @splat_v16f16(half %a, ptr %b) { ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.8h, v0.h[0] +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x half> undef, half %a, i64 0 %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer store <16 x half> %splat, ptr %b @@ -227,6 +327,12 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) { ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x float> undef, float %a, i64 0 %splat = 
shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer ret <2 x float> %splat @@ -239,6 +345,12 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) { ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x float> undef, float %a, i64 0 %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %splat @@ -251,6 +363,13 @@ define void @splat_v8f32(float %a, ptr %b) { ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x float> undef, float %a, i64 0 %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer store <8 x float> %splat, ptr %b @@ -261,6 +380,10 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) { ; CHECK-LABEL: splat_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x double> undef, double %a, i64 0 %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer ret <1 x double> %splat @@ -273,6 +396,12 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) { ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2d, v0.d[0] +; NONEON-NOSVE-NEXT: ret 
%insert = insertelement <2 x double> undef, double %a, i64 0 %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer ret <2 x double> %splat @@ -285,6 +414,13 @@ define void @splat_v4f64(double %a, ptr %b) { ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2d, v0.d[0] +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x double> undef, double %a, i64 0 %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer store <4 x double> %splat, ptr %b @@ -301,6 +437,12 @@ define void @splat_imm_v32i8(ptr %a) { ; CHECK-NEXT: mov z0.b, #1 // =0x1 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #1 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <32 x i8> undef, i8 1, i64 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer store <32 x i8> %splat, ptr %a @@ -313,6 +455,13 @@ define void @splat_imm_v16i16(ptr %a) { ; CHECK-NEXT: mov z0.h, #2 // =0x2 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #2 // =0x2 +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i16> undef, i16 2, i64 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer store <16 x i16> %splat, ptr %a @@ -325,6 +474,13 @@ define void @splat_imm_v8i32(ptr %a) { ; CHECK-NEXT: mov z0.s, #3 // =0x3 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, 
#3 // =0x3 +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i32> undef, i32 3, i64 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer store <8 x i32> %splat, ptr %a @@ -337,6 +493,13 @@ define void @splat_imm_v4i64(ptr %a) { ; CHECK-NEXT: mov z0.d, #4 // =0x4 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #4 // =0x4 +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i64> undef, i64 4, i64 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer store <4 x i64> %splat, ptr %a @@ -353,6 +516,13 @@ define void @splat_imm_v16f16(ptr %a) { ; CHECK-NEXT: fmov z0.h, #5.00000000 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #17664 // =0x4500 +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x half> undef, half 5.0, i64 0 %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer store <16 x half> %splat, ptr %a @@ -365,6 +535,12 @@ define void @splat_imm_v8f32(ptr %a) { ; CHECK-NEXT: fmov z0.s, #6.00000000 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov v0.4s, #6.00000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x float> undef, float 6.0, i64 0 %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer store <8 x float> %splat, ptr %a @@ -377,6 +553,12 @@ define void @splat_imm_v4f64(ptr %a) { ; CHECK-NEXT: fmov z0.d, #7.00000000 ; CHECK-NEXT: stp q0, q0, [x0] ; 
CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov v0.2d, #7.00000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x double> undef, double 7.0, i64 0 %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer store <4 x double> %splat, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll index c7435bdbec9497..f055061b13bed6 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -12,6 +13,11 @@ define void @store_v4i8(ptr %a) { ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i8> zeroinitializer, ptr %a ret void } @@ -22,6 +28,12 @@ define void @store_v8i8(ptr %a) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x i8> zeroinitializer, ptr %a ret void } @@ -32,6 +44,12 @@ define void @store_v16i8(ptr %a) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi 
v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x i8> zeroinitializer, ptr %a ret void } @@ -42,6 +60,12 @@ define void @store_v32i8(ptr %a) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <32 x i8> zeroinitializer, ptr %a ret void } @@ -53,6 +77,11 @@ define void @store_v2i16(ptr %a) { ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x i16> zeroinitializer, ptr %a ret void } @@ -64,6 +93,11 @@ define void @store_v2f16(ptr %a) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x half> zeroinitializer, ptr %a ret void } @@ -74,6 +108,12 @@ define void @store_v4i16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i16> zeroinitializer, ptr %a ret void } @@ -84,6 +124,12 @@ define void @store_v4f16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x half> zeroinitializer, ptr %a ret void } @@ -94,6 +140,12 @@ define void @store_v8i16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; 
+; NONEON-NOSVE-LABEL: store_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x i16> zeroinitializer, ptr %a ret void } @@ -104,6 +156,12 @@ define void @store_v8f16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x half> zeroinitializer, ptr %a ret void } @@ -114,6 +172,12 @@ define void @store_v16i16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x i16> zeroinitializer, ptr %a ret void } @@ -124,6 +188,12 @@ define void @store_v16f16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x half> zeroinitializer, ptr %a ret void } @@ -133,6 +203,11 @@ define void @store_v2i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: str xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x i32> zeroinitializer, ptr %a ret void } @@ -142,6 +217,11 @@ define void @store_v2f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: str xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x float> zeroinitializer, ptr %a ret void } @@ -151,6 +231,11 @@ define void 
@store_v4i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: stp xzr, xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp xzr, xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i32> zeroinitializer, ptr %a ret void } @@ -160,6 +245,11 @@ define void @store_v4f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: stp xzr, xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp xzr, xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x float> zeroinitializer, ptr %a ret void } @@ -170,6 +260,12 @@ define void @store_v8i32(ptr %a) { ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x i32> zeroinitializer, ptr %a ret void } @@ -180,6 +276,12 @@ define void @store_v8f32(ptr %a) { ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x float> zeroinitializer, ptr %a ret void } @@ -190,6 +292,12 @@ define void @store_v1i64(ptr %a) { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret store <1 x i64> zeroinitializer, ptr %a ret void } @@ -200,6 +308,12 @@ define void @store_v1f64(ptr %a) { ; CHECK-NEXT: fmov d0, xzr ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; 
NONEON-NOSVE-NEXT: ret store <1 x double> zeroinitializer, ptr %a ret void } @@ -209,6 +323,11 @@ define void @store_v2i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: stp xzr, xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp xzr, xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x i64> zeroinitializer, ptr %a ret void } @@ -218,6 +337,11 @@ define void @store_v2f64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: stp xzr, xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp xzr, xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x double> zeroinitializer, ptr %a ret void } @@ -228,6 +352,12 @@ define void @store_v4i64(ptr %a) { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i64> zeroinitializer, ptr %a ret void } @@ -238,6 +368,12 @@ define void @store_v4f64(ptr %a) { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x double> zeroinitializer, ptr %a ret void } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll index 9e04fc236836cc..80c9ef87e9b915 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme 
-force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE ; Test we can code generater patterns of the form: @@ -23,6 +24,12 @@ define void @subvector_v4i8(ptr %in, ptr %out) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: st1b { z0.h }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4i8: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i8>, ptr %in br label %bb1 @@ -37,6 +44,12 @@ define void @subvector_v8i8(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8i8: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i8>, ptr %in br label %bb1 @@ -51,6 +64,12 @@ define void @subvector_v16i8(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v16i8: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i8>, ptr %in br label %bb1 @@ -65,6 +84,12 @@ define void @subvector_v32i8(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v32i8: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in br label %bb1 @@ -81,6 +106,12 @@ define void @subvector_v2i16(ptr %in, ptr %out) { ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2i16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ret %a = 
load <2 x i16>, ptr %in br label %bb1 @@ -95,6 +126,12 @@ define void @subvector_v4i16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4i16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i16>, ptr %in br label %bb1 @@ -109,6 +146,12 @@ define void @subvector_v8i16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8i16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %in br label %bb1 @@ -123,6 +166,12 @@ define void @subvector_v16i16(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v16i16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in br label %bb1 @@ -138,6 +187,12 @@ define void @subvector_v2i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2i32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x i32>, ptr %in br label %bb1 @@ -152,6 +207,12 @@ define void @subvector_v4i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4i32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %in br label %bb1 @@ -166,6 +227,12 @@ define void @subvector_v8i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; 
CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8i32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in br label %bb1 @@ -181,6 +248,12 @@ define void @subvector_v2i64(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2i64: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %in br label %bb1 @@ -195,6 +268,12 @@ define void @subvector_v4i64(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4i64: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in br label %bb1 @@ -210,6 +289,12 @@ define void @subvector_v2f16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2f16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x half>, ptr %in br label %bb1 @@ -224,6 +309,12 @@ define void @subvector_v4f16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4f16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %in br label %bb1 @@ -238,6 +329,12 @@ define void @subvector_v8f16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8f16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; 
NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %in br label %bb1 @@ -252,6 +349,12 @@ define void @subvector_v16f16(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v16f16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x half>, ptr %in br label %bb1 @@ -267,6 +370,12 @@ define void @subvector_v2f32(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2f32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %in br label %bb1 @@ -281,6 +390,12 @@ define void @subvector_v4f32(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4f32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %in br label %bb1 @@ -295,6 +410,12 @@ define void @subvector_v8f32(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8f32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x float>,ptr %in br label %bb1 @@ -310,6 +431,12 @@ define void @subvector_v2f64(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2f64: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %in br label %bb1 @@ -324,6 +451,12 @@ define void @subvector_v4f64(ptr %in, ptr %out) { ; CHECK-NEXT: ldp 
q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4f64: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %in br label %bb1 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll index b34fe438a063a9..41b68e10e75ded 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -12,6 +13,13 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: st1b { z0.h }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v8i16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %ap %val = trunc <8 x i16> %a to <8 x i8> store <8 x i8> %val, ptr %dest @@ -25,6 +33,14 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: st1b { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v4i32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = trunc <4 x i32> %a to <4 x i8> store <4 x i8> %val, ptr %dest 
@@ -38,6 +54,13 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v4i32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = trunc <4 x i32> %a to <4 x i16> store <4 x i16> %val, ptr %dest @@ -51,6 +74,13 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: st1w { z0.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v2i64i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = trunc <2 x i64> %a to <2 x i32> store <2 x i32> %val, ptr %dest @@ -66,6 +96,14 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) { ; CHECK-NEXT: splice z1.d, p0, z1.d, z0.d ; CHECK-NEXT: str q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v2i256i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0, #32] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: mov v1.d[1], v0.d[0] +; NONEON-NOSVE-NEXT: str q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x i256>, ptr %ap %val = trunc <2 x i256> %a to <2 x i64> store <2 x i64> %val, ptr %dest diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll index 9e56462df38890..8242b4e26d5057 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme 
-force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,12 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i16_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = trunc <16 x i16> %a to <16 x i8> ret <16 x i8> %b @@ -41,6 +48,17 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.b, z2.b, z2.b ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i16>, ptr %in %b = trunc <32 x i16> %a to <32 x i8> %c = add <32 x i8> %b, %b @@ -76,6 +94,24 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v64i16_v64i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: uzp1 v3.16b, v5.16b, v4.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v6.16b, v1.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; 
NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b +; NONEON-NOSVE-NEXT: add v3.16b, v3.16b, v3.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <64 x i16>, ptr %in %b = trunc <64 x i16> %a to <64 x i8> %c = add <64 x i8> %b, %b @@ -133,6 +169,38 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q2, q3, [x1, #32] ; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] +; NONEON-NOSVE-NEXT: uzp1 v4.16b, v5.16b, v4.16b +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.16b, v7.16b, v6.16b +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v16.16b, v1.16b +; NONEON-NOSVE-NEXT: uzp1 v5.16b, v17.16b, v5.16b +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v4.16b, v4.16b, v4.16b +; NONEON-NOSVE-NEXT: uzp1 v7.16b, v18.16b, v7.16b +; NONEON-NOSVE-NEXT: add v3.16b, v6.16b, v6.16b +; NONEON-NOSVE-NEXT: uzp1 v6.16b, v17.16b, v16.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] +; NONEON-NOSVE-NEXT: add v0.16b, v5.16b, v5.16b +; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b +; NONEON-NOSVE-NEXT: add v4.16b, v7.16b, v7.16b +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] +; NONEON-NOSVE-NEXT: add v1.16b, v6.16b, v6.16b +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <128 x i16>, ptr %in %b = trunc <128 x i16> %a 
to <128 x i8> %c = add <128 x i8> %b, %b @@ -155,6 +223,13 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind { ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i32_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = trunc <8 x i32> %a to <8 x i8> ret <8 x i8> %b @@ -178,6 +253,15 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i32_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %a = load <16 x i32>, ptr %in %b = trunc <16 x i32> %a to <16 x i8> ret <16 x i8> %b @@ -215,6 +299,23 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.b, z3.b, z3.b ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i32_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v7.8h, v6.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load 
<32 x i32>, ptr %in %b = trunc <32 x i32> %a to <32 x i8> %c = add <32 x i8> %b, %b @@ -279,6 +380,36 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #128] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #160] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #224] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] +; NONEON-NOSVE-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.8h, v7.8h, v6.8h +; NONEON-NOSVE-NEXT: ldp q16, q7, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp1 v5.8h, v17.8h, v5.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: uzp1 v7.8h, v16.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v19.8h, v18.8h +; NONEON-NOSVE-NEXT: uzp1 v2.16b, v4.16b, v6.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v1.16b, v7.16b +; NONEON-NOSVE-NEXT: uzp1 v3.16b, v5.16b, v3.16b +; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add v3.16b, v3.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q1, q3, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i8> %c = add <64 x i8> %b, %b @@ -300,6 +431,12 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i32_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, 
v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = trunc <8 x i32> %a to <8 x i16> ret <8 x i16> %b @@ -322,6 +459,17 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.h, z2.h, z2.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i32>, ptr %in %b = trunc <16 x i32> %a to <16 x i16> %c = add <16 x i16> %b, %b @@ -357,6 +505,24 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i32_v32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v6.8h, v1.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h +; NONEON-NOSVE-NEXT: add v3.8h, v3.8h, v3.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i32>, ptr %in %b = trunc <32 x i32> %a to <32 x i16> %c = add <32 x i16> %b, %b @@ -414,6 +580,38 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q2, q3, [x1, #32] ; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
trunc_v64i32_v64i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] +; NONEON-NOSVE-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.8h, v7.8h, v6.8h +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v16.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp1 v5.8h, v17.8h, v5.8h +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v4.8h, v4.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v7.8h, v18.8h, v7.8h +; NONEON-NOSVE-NEXT: add v3.8h, v6.8h, v6.8h +; NONEON-NOSVE-NEXT: uzp1 v6.8h, v17.8h, v16.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] +; NONEON-NOSVE-NEXT: add v0.8h, v5.8h, v5.8h +; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h +; NONEON-NOSVE-NEXT: add v4.8h, v7.8h, v7.8h +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] +; NONEON-NOSVE-NEXT: add v1.8h, v6.8h, v6.8h +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i16> %c = add <64 x i16> %b, %b @@ -437,6 +635,13 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v4i64_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i8> ret <4 x i8> %b @@ -461,6 +666,16 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) 
nounwind { ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i64_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i8> ret <8 x i8> %b @@ -499,6 +714,21 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i64_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v3.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i8> ret <16 x i8> %b @@ -565,6 +795,35 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #128] +; 
NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #160] +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v16.4s, v17.4s, v16.4s +; NONEON-NOSVE-NEXT: uzp1 v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v7.4s, v19.4s, v18.4s +; NONEON-NOSVE-NEXT: uzp1 v6.4s, v21.4s, v20.4s +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v4.8h, v16.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v2.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v5.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i8> %c = add <32 x i8> %b, %b @@ -587,6 +846,13 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v4i64_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i16> ret <4 x i16> %b @@ -610,6 +876,15 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i64_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr 
%in %b = trunc <8 x i64> %a to <8 x i16> ret <8 x i16> %b @@ -647,6 +922,23 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.h, z3.h, z3.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i64_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i16> %c = add <16 x i16> %b, %b @@ -711,6 +1003,36 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #128] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #160] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #224] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: ldp q16, q7, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v5.4s, v17.4s, v5.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v7.4s, v16.4s, v7.4s +; 
NONEON-NOSVE-NEXT: uzp1 v3.4s, v19.4s, v18.4s +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v4.8h, v6.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v3.8h +; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add v3.8h, v3.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q3, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i16> %c = add <32 x i16> %b, %b @@ -732,6 +1054,12 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v4i64_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i32> ret <4 x i32> %b @@ -754,6 +1082,17 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.s, z2.s, z2.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i32> %c = add <8 x i32> %b, %b @@ -789,6 +1128,24 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i64_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; 
NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v6.4s, v1.4s +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v2.4s, v2.4s, v2.4s +; NONEON-NOSVE-NEXT: add v3.4s, v3.4s, v3.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i32> %c = add <16 x i32> %b, %b @@ -846,6 +1203,38 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q2, q3, [x1, #32] ; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v16.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v5.4s, v17.4s, v5.4s +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v4.4s, v4.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v7.4s, v18.4s, v7.4s +; NONEON-NOSVE-NEXT: add v3.4s, v6.4s, v6.4s +; NONEON-NOSVE-NEXT: uzp1 v6.4s, v17.4s, v16.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] +; NONEON-NOSVE-NEXT: add v0.4s, v5.4s, v5.4s +; NONEON-NOSVE-NEXT: add v2.4s, v2.4s, 
v2.4s +; NONEON-NOSVE-NEXT: add v4.4s, v7.4s, v7.4s +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] +; NONEON-NOSVE-NEXT: add v1.4s, v6.4s, v6.4s +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i32> %c = add <32 x i32> %b, %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll index 304823c9e64145..874af15e211177 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -14,6 +15,12 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v1.8b, v0.8b, v0.8b, #6 +; NONEON-NOSVE-NEXT: trn1 v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> ret <4 x i8> %ret } @@ -28,6 +35,11 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: insr z1.b, w8 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #7 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> ret <8 x i8> %ret } @@ -42,6 +54,11 @@ define <16 
x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: insr z1.b, w8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> ret <16 x i8> %ret @@ -60,6 +77,15 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.b, w8 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #15 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %ret = shufflevector <32 x i8> %op1, <32 x i8> %op2, <32 x i32> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32> ret <2 x i16> %ret } @@ -92,6 +123,11 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: insr z1.h, w8 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> ret <4 x i16> %ret } @@ -106,6 +142,11 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: insr z1.h, w8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
shuffle_ext_byone_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> ret <8 x i16> %ret } @@ -123,6 +164,15 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.h, w8 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %ret = shufflevector <16 x i16> %op1, <16 x i16> %op2, <16 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: insr z1.s, w8 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32> ret <2 x i32> %ret } @@ -155,6 +210,11 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: insr z1.s, w8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32> ret <4 x i32> %ret } @@ -172,6 +232,15 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.s, w8 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: 
ext v1.16b, v1.16b, v2.16b, #12 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %ret = shufflevector <8 x i32> %op1, <8 x i32> %op2, <8 x i32> @@ -189,6 +258,11 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: insr z1.d, x8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32> ret <2 x i64> %ret } @@ -206,6 +280,15 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.d, x8 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %ret = shufflevector <4 x i64> %op1, <4 x i64> %op2, <4 x i32> @@ -223,6 +306,11 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: insr z0.h, h2 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> ret <4 x half> %ret } @@ -236,6 +324,11 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: insr z0.h, h2 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, 
v1.16b, #14 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> ret <8 x half> %ret } @@ -251,6 +344,15 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.h, h2 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %ret = shufflevector <16 x half> %op1, <16 x half> %op2, <16 x i32> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) ; CHECK-NEXT: insr z0.s, s2 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> ret <2 x float> %ret } @@ -281,6 +388,11 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) ; CHECK-NEXT: insr z0.s, s2 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> ret <4 x float> %ret } @@ -296,6 +408,15 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.s, s2 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, 
v2.16b, #12 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %ret = shufflevector <8 x float> %op1, <8 x float> %op2, <8 x i32> @@ -312,6 +433,11 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op ; CHECK-NEXT: insr z0.d, d2 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> ret <2 x double> %ret } @@ -327,6 +453,15 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.d, d2 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> @@ -345,6 +480,15 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.d, d2 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_reverse: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> @@ -359,6 +503,13 @@ define void @shuffle_ext_invalid(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q1, [x1] ; 
CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_invalid: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll index 6c9c0556056684..e69f59aedc026f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -11,6 +12,11 @@ define fp128 @test_streaming_compatible_register_mov(fp128 %q0, fp128 %q1) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_streaming_compatible_register_mov: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret ret fp128 %q1 } @@ -20,6 +26,11 @@ define double @fp_zero_constant() { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, xzr ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fp_zero_constant: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov d0, xzr +; NONEON-NOSVE-NEXT: ret ret double 0.0 } @@ -29,6 +40,11 @@ define <2 x i64> @fixed_vec_zero_constant() { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fixed_vec_zero_constant: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, 
#0000000000000000 +; NONEON-NOSVE-NEXT: ret ret <2 x i64> zeroinitializer } @@ -38,5 +54,10 @@ define <2 x double> @fixed_vec_fp_zero_constant() { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: ret ret <2 x double> } From 3864bfd2e0ce7e32fd623c550660885599383e6a Mon Sep 17 00:00:00 2001 From: Daniil Fukalov <1671137+dfukalov@users.noreply.github.com> Date: Tue, 28 May 2024 16:09:53 +0200 Subject: [PATCH 16/89] [IR] Fix ignoring `non-global-value-max-name-size` in `ValueSymbolTable::makeUniqueName()`. (#89057) E.g. during inlining new symbol name can be duplicated and then `ValueSymbolTable::makeUniqueName()` will add unique suffix, exceeding the `non-global-value-max-name-size` restriction. Also fixed `unsigned` type of the option to `int` since `ValueSymbolTable`' constructor can use `-1` value that means unrestricted name size. --- llvm/lib/IR/Function.cpp | 2 +- llvm/lib/IR/ValueSymbolTable.cpp | 33 ++++++++++++------- .../non-global-value-max-name-size-2.ll | 23 +++++++++++++ llvm/test/Bitcode/value-with-long-name-dbg.ll | 11 +++++++ llvm/test/Bitcode/value-with-long-name.ll | 4 +-- 5 files changed, 58 insertions(+), 15 deletions(-) create mode 100644 llvm/test/Assembler/non-global-value-max-name-size-2.ll create mode 100644 llvm/test/Bitcode/value-with-long-name-dbg.ll diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index bd06ff82a15a58..13fa1afeaaff24 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -79,7 +79,7 @@ using ProfileCount = Function::ProfileCount; // are not in the public header file... 
template class llvm::SymbolTableListTraits; -static cl::opt NonGlobalValueMaxNameSize( +static cl::opt NonGlobalValueMaxNameSize( "non-global-value-max-name-size", cl::Hidden, cl::init(1024), cl::desc("Maximum size for the name of non-global values.")); diff --git a/llvm/lib/IR/ValueSymbolTable.cpp b/llvm/lib/IR/ValueSymbolTable.cpp index 52f7ddcdc65a2b..a020acf22a96c5 100644 --- a/llvm/lib/IR/ValueSymbolTable.cpp +++ b/llvm/lib/IR/ValueSymbolTable.cpp @@ -43,23 +43,34 @@ ValueSymbolTable::~ValueSymbolTable() { ValueName *ValueSymbolTable::makeUniqueName(Value *V, SmallString<256> &UniqueName) { unsigned BaseSize = UniqueName.size(); + bool AppenDot = false; + if (auto *GV = dyn_cast(V)) { + // A dot is appended to mark it as clone during ABI demangling so that + // for example "_Z1fv" and "_Z1fv.1" both demangle to "f()", the second + // one being a clone. + // On NVPTX we cannot use a dot because PTX only allows [A-Za-z0-9_$] for + // identifiers. This breaks ABI demangling but at least ptxas accepts and + // compiles the program. + const Module *M = GV->getParent(); + if (!(M && Triple(M->getTargetTriple()).isNVPTX())) + AppenDot = true; + } + while (true) { // Trim any suffix off and append the next number. UniqueName.resize(BaseSize); raw_svector_ostream S(UniqueName); - if (auto *GV = dyn_cast(V)) { - // A dot is appended to mark it as clone during ABI demangling so that - // for example "_Z1fv" and "_Z1fv.1" both demangle to "f()", the second - // one being a clone. - // On NVPTX we cannot use a dot because PTX only allows [A-Za-z0-9_$] for - // identifiers. This breaks ABI demangling but at least ptxas accepts and - // compiles the program. - const Module *M = GV->getParent(); - if (!(M && Triple(M->getTargetTriple()).isNVPTX())) - S << "."; - } + if (AppenDot) + S << "."; S << ++LastUnique; + // Retry if MaxNameSize has been exceeded. 
+ if (MaxNameSize > -1 && UniqueName.size() > (size_t)MaxNameSize) { + assert(BaseSize >= UniqueName.size() - (size_t)MaxNameSize && + "Can't generate unique name: MaxNameSize is too small."); + BaseSize -= UniqueName.size() - (size_t)MaxNameSize; + continue; + } // Try insert the vmap entry with this suffix. auto IterBool = vmap.insert(std::make_pair(UniqueName.str(), V)); if (IterBool.second) diff --git a/llvm/test/Assembler/non-global-value-max-name-size-2.ll b/llvm/test/Assembler/non-global-value-max-name-size-2.ll new file mode 100644 index 00000000000000..5eac003ddb4383 --- /dev/null +++ b/llvm/test/Assembler/non-global-value-max-name-size-2.ll @@ -0,0 +1,23 @@ +; RUN: opt < %s -S -passes='always-inline' -non-global-value-max-name-size=5 | opt -non-global-value-max-name-size=5 -passes=verify -disable-output + +; Opt should not generate too long name for labels during inlining. + +define internal i32 @inner(i32 %flag) alwaysinline { +entry: + %icmp = icmp slt i32 %flag, 0 + br i1 %icmp, label %one, label %two + +one: + ret i32 42 + +two: + ret i32 44 +} + +define i32 @outer(i32 %x) { +entry: + %call1 = call i32 @inner(i32 %x) + %call2 = call i32 @inner(i32 %x) + %ret = add i32 %call1, %call2 + ret i32 %ret +} \ No newline at end of file diff --git a/llvm/test/Bitcode/value-with-long-name-dbg.ll b/llvm/test/Bitcode/value-with-long-name-dbg.ll new file mode 100644 index 00000000000000..0cc3569d8617b3 --- /dev/null +++ b/llvm/test/Bitcode/value-with-long-name-dbg.ll @@ -0,0 +1,11 @@ +; REQUIRES: asserts +; Force the size to be small to check assertion message. +; RUN: not --crash opt -S %s -O2 -o - -non-global-value-max-name-size=0 2>&1 | FileCheck %s +; CHECK: Can't generate unique name: MaxNameSize is too small. 
+ +define i32 @f(i32 %a, i32 %b) { + %c = add i32 %a, %b + %d = add i32 %c, %a + %e = add i32 %d, %b + ret i32 %e +} diff --git a/llvm/test/Bitcode/value-with-long-name.ll b/llvm/test/Bitcode/value-with-long-name.ll index 1ca5d133e09ae3..aa7da5f5b7dba9 100644 --- a/llvm/test/Bitcode/value-with-long-name.ll +++ b/llvm/test/Bitcode/value-with-long-name.ll @@ -1,10 +1,10 @@ ; Check the size of generated variable when no option is set ; RUN: opt -S %s -O2 -o - | FileCheck -check-prefix=CHECK-LONG %s +; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=-1 | FileCheck -check-prefix=CHECK-LONG %s ; CHECK-LONG: %{{[a-z]{4}[a-z]+}} ; Then check we correctly cap the size of newly generated non-global values name ; Force the size to be small so that the check works on release and debug build -; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=0 | FileCheck -check-prefix=CHECK-SHORT %s ; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=1 | FileCheck -check-prefix=CHECK-SHORT %s ; CHECK-SHORT-NOT: %{{[a-z][a-z]+}} @@ -14,5 +14,3 @@ define i32 @f(i32 %a, i32 %b) { %e = add i32 %d, %b ret i32 %e } - - From d2a103e682d65c3bfdff1d6a6f7b114e6cf4ff76 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 07:31:29 -0700 Subject: [PATCH 17/89] [memprof] Remove const from the return type of toMemProfRecord (#93415) "const" being removed in this patch prevents the move semantics from being used in: AI.CallStack = Callback(IndexedAI.CSId); With this patch on an indexed MemProf Version 2 profile, the cycle count and instruction count go down by 13.3% and 26.3%, respectively, with "llvm-profdata show" modified to deserialize all MemProfRecords. 
--- llvm/include/llvm/ProfileData/MemProf.h | 4 ++-- llvm/lib/ProfileData/MemProf.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 66a99f16cdb638..17cef15344285b 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -426,8 +426,8 @@ struct IndexedMemProfRecord { // Convert IndexedMemProfRecord to MemProfRecord. Callback is used to // translate CallStackId to call stacks with frames inline. MemProfRecord toMemProfRecord( - llvm::function_ref(const CallStackId)> - Callback) const; + llvm::function_ref(const CallStackId)> Callback) + const; // Returns the GUID for the function name after canonicalization. For // memprof, we remove any .llvm suffix added by LTO. MemProfRecords are diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index e5608644519db4..89afe7c39027c6 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -245,8 +245,8 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, } MemProfRecord IndexedMemProfRecord::toMemProfRecord( - llvm::function_ref(const CallStackId)> - Callback) const { + llvm::function_ref(const CallStackId)> Callback) + const { MemProfRecord Record; Record.AllocSites.reserve(AllocSites.size()); From 74ed79f7f123788d95f1552800e1af9ceaee4a08 Mon Sep 17 00:00:00 2001 From: Ryan Holt Date: Tue, 28 May 2024 10:42:32 -0400 Subject: [PATCH 18/89] [mlir][linalg] Add linalg.transpose constant folding (#92589) There was existing support for constant folding a `linalg.generic` that was actually a transpose. This commit adds support for the named op, `linalg.transpose`, as well by making use of the `LinalgOp` interface. 
--- .../Linalg/Transforms/ConstantFold.cpp | 62 ++++---- mlir/test/Dialect/Linalg/constant-fold.mlir | 148 ++++++++++++++++++ .../Linalg/fusion-elementwise-ops.mlir | 133 ---------------- 3 files changed, 180 insertions(+), 163 deletions(-) create mode 100644 mlir/test/Dialect/Linalg/constant-fold.mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp index 8fffabf11f3fdd..2e6079e1402e1d 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp @@ -23,21 +23,21 @@ using namespace mlir; using namespace mlir::linalg; namespace { -/// Base class for constant folding linalg.generic ops with N inputs, 1 output, -/// and permutation indexing maps. +/// Base class for constant folding linalg structured ops with N inputs, 1 +/// output, and permutation indexing maps. /// /// `ConcreteType` should provide methods with signatures /// /// ```c++ -/// bool matchIndexingMaps(GenericOp genericOp) const; -/// RegionComputationFn getRegionComputeFn(GenericOp) const; +/// bool matchIndexingMaps(LinalgOp linalgOp) const; +/// RegionComputationFn getRegionComputeFn(LinalgOp) const; /// ``` /// /// The latter inspects the region and returns the computation inside as a /// functor. The functor will be invoked with constant elements for all inputs /// and should return the corresponding computed constant element for output. 
template -class FoldConstantBase : public OpRewritePattern { +class FoldConstantBase : public OpInterfaceRewritePattern { public: struct APIntOrFloat { std::optional apInt; @@ -52,25 +52,26 @@ class FoldConstantBase : public OpRewritePattern { FoldConstantBase(MLIRContext *context, const ControlFusionFn &controlFn, PatternBenefit benefit = 1) - : OpRewritePattern(context, benefit), controlFn(controlFn) {} + : OpInterfaceRewritePattern(context, benefit), + controlFn(controlFn) {} - LogicalResult matchAndRewrite(GenericOp genericOp, + LogicalResult matchAndRewrite(LinalgOp linalgOp, PatternRewriter &rewriter) const override { // Mixed and buffer sematics aren't supported. - if (!genericOp.hasPureTensorSemantics()) + if (!linalgOp.hasPureTensorSemantics()) return failure(); // Only support ops generating one output for now. - if (genericOp.getNumDpsInits() != 1) + if (linalgOp.getNumDpsInits() != 1) return failure(); - auto outputType = dyn_cast(genericOp.getResultTypes().front()); + auto outputType = dyn_cast(linalgOp->getResultTypes().front()); // Require the output types to be static given that we are generating // constants. if (!outputType || !outputType.hasStaticShape()) return failure(); - if (!llvm::all_of(genericOp.getInputs(), [](Value input) { + if (!llvm::all_of(linalgOp.getDpsInputs(), [](Value input) { return isa(input.getType()); })) return failure(); @@ -80,7 +81,7 @@ class FoldConstantBase : public OpRewritePattern { return cast(value.getType()).getElementType(); }; if (!llvm::all_equal( - llvm::map_range(genericOp->getOperands(), getOperandElementType))) + llvm::map_range(linalgOp->getOperands(), getOperandElementType))) return failure(); // We can only handle the case where we have int/float elements. @@ -93,30 +94,30 @@ class FoldConstantBase : public OpRewritePattern { // entirely in the compiler, without needing to turn all indices into // Values, and then do affine apply on them, and then match back the // constant again. 
- if (!llvm::all_of(genericOp.getIndexingMapsArray(), + if (!llvm::all_of(linalgOp.getIndexingMapsArray(), [](AffineMap map) { return map.isPermutation(); })) return failure(); - for (OpOperand &operand : genericOp.getDpsInitsMutable()) { - if (genericOp.payloadUsesValueFromOperand(&operand)) + for (OpOperand &operand : linalgOp.getDpsInitsMutable()) { + if (linalgOp.payloadUsesValueFromOperand(&operand)) return failure(); } // Further check the indexing maps are okay for the ConcreteType. - if (!static_cast(this)->matchIndexingMaps(genericOp)) + if (!static_cast(this)->matchIndexingMaps(linalgOp)) return failure(); // Defer to the concrete type to check the region and discover the // computation inside. RegionComputationFn computeFn = - static_cast(this)->getRegionComputeFn(genericOp); + static_cast(this)->getRegionComputeFn(linalgOp); if (!computeFn) return failure(); // All inputs should be constants. - int numInputs = genericOp.getNumDpsInputs(); + int numInputs = linalgOp.getNumDpsInputs(); SmallVector inputValues(numInputs); - for (const auto &en : llvm::enumerate(genericOp.getDpsInputOperands())) { + for (const auto &en : llvm::enumerate(linalgOp.getDpsInputOperands())) { if (!matchPattern(en.value()->get(), m_Constant(&inputValues[en.index()]))) return failure(); @@ -124,12 +125,11 @@ class FoldConstantBase : public OpRewritePattern { // Identified this as a potential candidate for folding. Now check the // policy to see whether we are allowed to proceed. 
- for (OpOperand *operand : genericOp.getDpsInputOperands()) { + for (OpOperand *operand : linalgOp.getDpsInputOperands()) { if (!controlFn(operand)) return failure(); } - auto linalgOp = cast(genericOp.getOperation()); SmallVector loopBounds = linalgOp.computeStaticLoopSizes(); int64_t numElements = outputType.getNumElements(); @@ -155,8 +155,8 @@ class FoldConstantBase : public OpRewritePattern { SmallVector> inputDims; for (int i = 0; i < numInputs; ++i) - inputDims.push_back(getDimPositions(genericOp.getIndexingMapsArray()[i])); - auto outputDims = getDimPositions(genericOp.getIndexingMapsArray().back()); + inputDims.push_back(getDimPositions(linalgOp.getIndexingMapsArray()[i])); + auto outputDims = getDimPositions(linalgOp.getIndexingMapsArray().back()); auto outputShape = outputType.getShape(); // Allocate small vectors for index delinearization. Initial values do not @@ -173,7 +173,7 @@ class FoldConstantBase : public OpRewritePattern { APIntOrFloatArray computeFnInputs; auto inputShapes = llvm::to_vector<4>( - llvm::map_range(genericOp.getInputs(), [](Value value) { + llvm::map_range(linalgOp.getDpsInputs(), [](Value value) { return cast(value.getType()).getShape(); })); @@ -254,7 +254,7 @@ class FoldConstantBase : public OpRewritePattern { isFloat ? DenseElementsAttr::get(outputType, fpOutputValues) : DenseElementsAttr::get(outputType, intOutputValues); - rewriter.replaceOpWithNewOp(genericOp, outputAttr); + rewriter.replaceOpWithNewOp(linalgOp, outputAttr); return success(); } @@ -262,18 +262,20 @@ class FoldConstantBase : public OpRewritePattern { ControlFusionFn controlFn; }; -// Folds linalg.generic ops that are actually transposes on constant values. +// Folds linalg.transpose (and linalg.generic ops that are actually transposes) +// on constant values. 
struct FoldConstantTranspose : public FoldConstantBase { + using FoldConstantBase::FoldConstantBase; - bool matchIndexingMaps(GenericOp genericOp) const { + bool matchIndexingMaps(LinalgOp linalgOp) const { // We should have one input and one output. - return genericOp.getIndexingMapsArray().size() == 2; + return linalgOp.getIndexingMapsArray().size() == 2; } - RegionComputationFn getRegionComputeFn(GenericOp genericOp) const { + RegionComputationFn getRegionComputeFn(LinalgOp linalgOp) const { // Make sure the region only contains a yield op. - Block &body = genericOp.getRegion().front(); + Block &body = linalgOp->getRegion(0).front(); if (!llvm::hasSingleElement(body)) return nullptr; auto yieldOp = dyn_cast(body.getTerminator()); diff --git a/mlir/test/Dialect/Linalg/constant-fold.mlir b/mlir/test/Dialect/Linalg/constant-fold.mlir new file mode 100644 index 00000000000000..3929c26a3382f4 --- /dev/null +++ b/mlir/test/Dialect/Linalg/constant-fold.mlir @@ -0,0 +1,148 @@ +// RUN: mlir-opt %s -linalg-fuse-elementwise-ops -split-input-file | FileCheck %s + +// CHECK-LABEL: @transpose_fold_2d_fp32 +func.func @transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { + %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> + // CHECK: %[[CST:.+]] = arith.constant + // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32> + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { + ^bb0(%arg1: f32, %arg2: f32): + linalg.yield %arg1 : f32 + } -> tensor<3x2xf32> + // CHECK: return %[[CST]] + return %1 : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @transpose_fold_2d_fp64 +func.func @transpose_fold_2d_fp64(%init: tensor<3x2xf64>) -> tensor<3x2xf64> { + %input = arith.constant dense<[[0.0, 
1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf64> + // CHECK: %[[CST:.+]] = arith.constant + // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf64> + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf64>) outs(%init : tensor<3x2xf64>) { + ^bb0(%arg1: f64, %arg2: f64): + linalg.yield %arg1 : f64 + } -> tensor<3x2xf64> + // CHECK: return %[[CST]] + return %1 : tensor<3x2xf64> +} + +// ----- + +// CHECK-LABEL: @transpose_fold_4d_i32 +func.func @transpose_fold_4d_i32(%init: tensor<3x1x4x2xi32>) -> tensor<3x1x4x2xi32> { + %input = arith.constant dense<[[ + [[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], + [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] + ]]> : tensor<1x2x3x4xi32> + // CHECK: %[[CST:.+]] = arith.constant dense<[ + // CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]], + // CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]], + // CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]] + // CHECK-SAME{LITERAL}: ]> + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"] + } ins(%input : tensor<1x2x3x4xi32>) outs(%init : tensor<3x1x4x2xi32>) { + ^bb0(%arg1: i32, %arg2: i32): + linalg.yield %arg1 : i32 + } -> tensor<3x1x4x2xi32> + // CHECK: return %[[CST]] + return %1 : tensor<3x1x4x2xi32> +} + +// ----- + +// CHECK-LABEL: @transpose_fold_4d_i16 +func.func @transpose_fold_4d_i16(%init: tensor<3x1x4x2xi16>) -> tensor<3x1x4x2xi16> { + %input = arith.constant dense<[[ + [[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], + [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] + ]]> : tensor<1x2x3x4xi16> + // CHECK: %[[CST:.+]] = arith.constant dense<[ + // 
CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]], + // CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]], + // CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]] + // CHECK-SAME{LITERAL}: ]> + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"] + } ins(%input : tensor<1x2x3x4xi16>) outs(%init : tensor<3x1x4x2xi16>) { + ^bb0(%arg1: i16, %arg2: i16): + linalg.yield %arg1 : i16 + } -> tensor<3x1x4x2xi16> + // CHECK: return %[[CST]] + return %1 : tensor<3x1x4x2xi16> +} + +// ----- + +// CHECK-LABEL: @transpose_nofold_non_cst_input +func.func @transpose_nofold_non_cst_input(%input: tensor<2x3xf32>, %init: tensor<3x2xf32>) -> tensor<3x2xf32> { + // CHECK: linalg.generic + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { + ^bb0(%arg1: f32, %arg2: f32): + linalg.yield %arg1 : f32 + } -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @transpose_nofold_yield_const +func.func @transpose_nofold_yield_const(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { + %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> + %cst = arith.constant 8.0 : f32 + // CHECK: linalg.generic + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { + ^bb0(%arg1: f32, %arg2: f32): + linalg.yield %cst : f32 + } -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @transpose_nofold_multi_ops_in_region +func.func @transpose_nofold_multi_ops_in_region(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { + 
%input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> + // CHECK: linalg.generic + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { + ^bb0(%arg1: f32, %arg2: f32): + %add = arith.addf %arg1, %arg1 : f32 + linalg.yield %add : f32 + } -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @named_transpose_fold_2d_fp32 +func.func @named_transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { + %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> + // CHECK: %[[CST:.+]] = arith.constant + // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32> + %1 = linalg.transpose ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) permutation = [1, 0] + // CHECK: return %[[CST]] + return %1 : tensor<3x2xf32> +} + +// ----- + + diff --git a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir index 15a4f6cdd3bbe4..e45a9fbb1052c1 100644 --- a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir +++ b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir @@ -777,139 +777,6 @@ func.func @fuse_scalar_constant(%arg0 : tensor) -> (tensor, te // ----- -// CHECK-LABEL: @transpose_fold_2d_fp32 -func.func @transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { - %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> - // CHECK: %[[CST:.+]] = arith.constant - // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32> - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", 
"parallel"] - } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { - ^bb0(%arg1: f32, %arg2: f32): - linalg.yield %arg1 : f32 - } -> tensor<3x2xf32> - // CHECK: return %[[CST]] - return %1 : tensor<3x2xf32> -} - -// ----- - -// CHECK-LABEL: @transpose_fold_2d_fp64 -func.func @transpose_fold_2d_fp64(%init: tensor<3x2xf64>) -> tensor<3x2xf64> { - %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf64> - // CHECK: %[[CST:.+]] = arith.constant - // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf64> - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"] - } ins(%input : tensor<2x3xf64>) outs(%init : tensor<3x2xf64>) { - ^bb0(%arg1: f64, %arg2: f64): - linalg.yield %arg1 : f64 - } -> tensor<3x2xf64> - // CHECK: return %[[CST]] - return %1 : tensor<3x2xf64> -} - -// ----- - -// CHECK-LABEL: @transpose_fold_4d_i32 -func.func @transpose_fold_4d_i32(%init: tensor<3x1x4x2xi32>) -> tensor<3x1x4x2xi32> { - %input = arith.constant dense<[[ - [[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], - [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] - ]]> : tensor<1x2x3x4xi32> - // CHECK: %[[CST:.+]] = arith.constant dense<[ - // CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]], - // CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]], - // CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]] - // CHECK-SAME{LITERAL}: ]> - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], - iterator_types = ["parallel", "parallel", "parallel", "parallel"] - } ins(%input : tensor<1x2x3x4xi32>) outs(%init : tensor<3x1x4x2xi32>) { - ^bb0(%arg1: i32, %arg2: i32): - linalg.yield %arg1 : i32 - } -> tensor<3x1x4x2xi32> - // CHECK: return %[[CST]] - return %1 : 
tensor<3x1x4x2xi32> -} - -// ----- - -// CHECK-LABEL: @transpose_fold_4d_i16 -func.func @transpose_fold_4d_i16(%init: tensor<3x1x4x2xi16>) -> tensor<3x1x4x2xi16> { - %input = arith.constant dense<[[ - [[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], - [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] - ]]> : tensor<1x2x3x4xi16> - // CHECK: %[[CST:.+]] = arith.constant dense<[ - // CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]], - // CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]], - // CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]] - // CHECK-SAME{LITERAL}: ]> - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], - iterator_types = ["parallel", "parallel", "parallel", "parallel"] - } ins(%input : tensor<1x2x3x4xi16>) outs(%init : tensor<3x1x4x2xi16>) { - ^bb0(%arg1: i16, %arg2: i16): - linalg.yield %arg1 : i16 - } -> tensor<3x1x4x2xi16> - // CHECK: return %[[CST]] - return %1 : tensor<3x1x4x2xi16> -} - -// ----- - -// CHECK-LABEL: @transpose_nofold_non_cst_input -func.func @transpose_nofold_non_cst_input(%input: tensor<2x3xf32>, %init: tensor<3x2xf32>) -> tensor<3x2xf32> { - // CHECK: linalg.generic - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"] - } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { - ^bb0(%arg1: f32, %arg2: f32): - linalg.yield %arg1 : f32 - } -> tensor<3x2xf32> - return %1 : tensor<3x2xf32> -} - -// ----- - -// CHECK-LABEL: @transpose_nofold_yield_const -func.func @transpose_nofold_yield_const(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { - %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> - %cst = arith.constant 8.0 : f32 - // CHECK: linalg.generic - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - 
iterator_types = ["parallel", "parallel"] - } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { - ^bb0(%arg1: f32, %arg2: f32): - linalg.yield %cst : f32 - } -> tensor<3x2xf32> - return %1 : tensor<3x2xf32> -} - -// ----- - -// CHECK-LABEL: @transpose_nofold_multi_ops_in_region -func.func @transpose_nofold_multi_ops_in_region(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { - %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> - // CHECK: linalg.generic - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"] - } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { - ^bb0(%arg1: f32, %arg2: f32): - %add = arith.addf %arg1, %arg1 : f32 - linalg.yield %add : f32 - } -> tensor<3x2xf32> - return %1 : tensor<3x2xf32> -} - -// ----- - // Fusing the broadcast into a reduction would require to insert extra knowledge // about the size of the reduction dimension. As long, as this is not // implemented, we check that two linalg operations remain. From cde1ae4c14eecd47215f04d4387845231021d939 Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Tue, 28 May 2024 11:11:55 -0400 Subject: [PATCH 19/89] [lldb][NativePDB] Fix uninitialized values found by msan. 
--- .../source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp index fab3ca989c0ec6..17c5f6118603f4 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp @@ -47,15 +47,18 @@ UdtRecordCompleter::UdtRecordCompleter( CVType cvt = m_index.tpi().getType(m_id.index); switch (cvt.kind()) { case LF_ENUM: + m_cvr.er.Options = ClassOptions::None; llvm::cantFail(TypeDeserializer::deserializeAs(cvt, m_cvr.er)); break; case LF_UNION: + m_cvr.ur.Options = ClassOptions::None; llvm::cantFail(TypeDeserializer::deserializeAs(cvt, m_cvr.ur)); m_layout.bit_size = m_cvr.ur.getSize() * 8; m_record.record.kind = Member::Union; break; case LF_CLASS: case LF_STRUCTURE: + m_cvr.cr.Options = ClassOptions::None; llvm::cantFail(TypeDeserializer::deserializeAs(cvt, m_cvr.cr)); m_layout.bit_size = m_cvr.cr.getSize() * 8; m_record.record.kind = Member::Struct; From 94be801879788399a7ffa8c7cbe28f6c86e26ffe Mon Sep 17 00:00:00 2001 From: stefankoncarevic Date: Tue, 28 May 2024 17:17:02 +0200 Subject: [PATCH 20/89] [mlir][ROCDL] Update the LLVM data layout for ROCDL lowering. (#92127) This change updates the dataLayout string to ensure alignment with the latest LLVM TargetMachine configuration. The aim is to maintain consistency and prevent potential compilation issues related to memory address space handling. 
--- mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 6 +++--- mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index f425b1f59d9940..70dcccf0a7307a 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -77,9 +77,9 @@ Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, } static constexpr StringLiteral amdgcnDataLayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" - "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:" - "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-" - "G1-ni:7:8"; + "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:" + "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:" + "64-S32-A5-G1-ni:7:8:9"; namespace { struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 8a2d8bd7967caf..a8d61a6a0f6fd9 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -2,7 +2,8 @@ // RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s // CHECK-LABEL: @test_module -// CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +// CHECK-SAME: llvm.data_layout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" + gpu.module @test_module { // CHECK-LABEL: func @gpu_index_ops() // CHECK32-LABEL: func @gpu_index_ops() From 26e0ce0b3633c67e09d2f3a99e0d4058a4e0a887 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Tue, 28 May 2024 17:32:27 +0200 Subject: [PATCH 21/89] [flang] update fir.box_rank and fir.is_array codegen (#93541) fir.box_rank codegen was invalid, it was assuming the rank field in the descriptor was an i32. This is not correct. Do not hard code the type, use the named position to find the type, and convert as needed in the patterns. --- flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h | 4 ++++ flang/lib/Optimizer/CodeGen/CodeGen.cpp | 9 ++++----- flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp | 8 ++++++++ flang/test/Fir/convert-to-llvm.fir | 9 +++++---- flang/test/Fir/tbaa.fir | 11 ++++++----- 5 files changed, 27 insertions(+), 14 deletions(-) diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h index 06a44f1885656f..510ff729989145 100644 --- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h +++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h @@ -101,6 +101,10 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern { mlir::Value box, mlir::ConversionPatternRewriter &rewriter) const; + mlir::Value getRankFromBox(mlir::Location loc, TypePair boxTy, + mlir::Value box, + mlir::ConversionPatternRewriter &rewriter) const; + // Get the element type given an LLVM type that is of the form // (array|struct|vector)+ and the provided indexes. 
mlir::Type getBoxEleTy(mlir::Type type, diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 74e68725003cb9..664453ebaf2f74 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -391,9 +391,8 @@ struct BoxIsArrayOpConversion : public fir::FIROpConversion { mlir::Value a = adaptor.getOperands()[0]; auto loc = boxisarray.getLoc(); TypePair boxTyPair = getBoxTypePair(boxisarray.getVal().getType()); - auto rank = getValueFromBox(loc, boxTyPair, a, rewriter.getI32Type(), - rewriter, kRankPosInBox); - auto c0 = genConstantOffset(loc, rewriter, 0); + mlir::Value rank = getRankFromBox(loc, boxTyPair, a, rewriter); + mlir::Value c0 = genConstantIndex(loc, rank.getType(), rewriter, 0); rewriter.replaceOpWithNewOp( boxisarray, mlir::LLVM::ICmpPredicate::ne, rank, c0); return mlir::success(); @@ -430,8 +429,8 @@ struct BoxRankOpConversion : public fir::FIROpConversion { auto loc = boxrank.getLoc(); mlir::Type ty = convertType(boxrank.getType()); TypePair boxTyPair = getBoxTypePair(boxrank.getVal().getType()); - auto result = - getValueFromBox(loc, boxTyPair, a, ty, rewriter, kRankPosInBox); + mlir::Value rank = getRankFromBox(loc, boxTyPair, a, rewriter); + mlir::Value result = integerCast(loc, rewriter, ty, rank); rewriter.replaceOp(boxrank, result); return mlir::success(); } diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp index 69e78167b07333..8c726d547491a7 100644 --- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp +++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp @@ -179,6 +179,14 @@ mlir::Value ConvertFIRToLLVMPattern::getElementSizeFromBox( return getValueFromBox(loc, boxTy, box, resultTy, rewriter, kElemLenPosInBox); } +/// Read base address from a fir.box. Returned address has type ty. 
+mlir::Value ConvertFIRToLLVMPattern::getRankFromBox( + mlir::Location loc, TypePair boxTy, mlir::Value box, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resultTy = getBoxEleTy(boxTy.llvm, {kRankPosInBox}); + return getValueFromBox(loc, boxTy, box, resultTy, rewriter, kRankPosInBox); +} + // Get the element type given an LLVM type that is of the form // (array|struct|vector)+ and the provided indexes. mlir::Type ConvertFIRToLLVMPattern::getBoxEleTy( diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 21323a5e657c94..70cb0443e9a645 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -941,7 +941,8 @@ func.func @extract_rank(%arg0: !fir.box>) -> i32 { // CHECK-LABEL: llvm.func @extract_rank( // CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr) -> i32 // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})> -// CHECK: %[[RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i32 +// CHECK: %[[RAW_RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i8 +// CHECK: %[[RANK:.*]] = llvm.sext %[[RAW_RANK]] : i8 to i32 // CHECK: llvm.return %[[RANK]] : i32 // ----- @@ -1009,9 +1010,9 @@ func.func @box_isarray(%arg0: !fir.box>) -> i1 { // CHECK-LABEL: llvm.func @box_isarray( // CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr) -> i1 // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})> -// CHECK: %[[RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i32 -// CHECK: %[[C0_ISARRAY:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[IS_ARRAY:.*]] = llvm.icmp "ne" %[[RANK]], %[[C0_ISARRAY]] : i32 +// CHECK: %[[RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i8 +// CHECK: %[[C0_ISARRAY:.*]] = llvm.mlir.constant(0 : i64) : i8 +// CHECK: %[[IS_ARRAY:.*]] = llvm.icmp "ne" %[[RANK]], %[[C0_ISARRAY]] : i8 // CHECK: llvm.return 
%[[IS_ARRAY]] : i1 // ----- diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir index 048f53f5c6e47a..f4f23d35cba257 100644 --- a/flang/test/Fir/tbaa.fir +++ b/flang/test/Fir/tbaa.fir @@ -248,8 +248,9 @@ func.func @tbaa(%arg0: !fir.box>) -> i32 { // CHECK-LABEL: llvm.func @tbaa( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.getelementptr %[[VAL_0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> -// CHECK: %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i32 -// CHECK: llvm.return %[[VAL_2]] : i32 +// CHECK: %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i8 +// CHECK: %[[VAL_3:.*]] = llvm.sext %[[VAL_2]] : i8 to i32 +// CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } // ----- @@ -267,9 +268,9 @@ func.func @tbaa(%arg0: !fir.box>) -> i1 { // CHECK-LABEL: llvm.func @tbaa( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr) -> i1 { // CHECK: %[[VAL_1:.*]] = llvm.getelementptr %[[VAL_0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> -// CHECK: %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i32 -// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_4:.*]] = llvm.icmp "ne" %[[VAL_2]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i8 +// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(0 : i64) : i8 +// CHECK: %[[VAL_4:.*]] = llvm.icmp "ne" %[[VAL_2]], %[[VAL_3]] : i8 // CHECK: llvm.return %[[VAL_4]] : i1 // CHECK: } From 88902147c11f8de5cc7c792fd8c476a821664297 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 28 May 2024 10:21:40 -0500 Subject: [PATCH 22/89] [Frontend][OpenMP] Rename some variables, NFC Rename things in a couple of places to make the code a bit clearer. 
--- .../llvm/Frontend/OpenMP/ConstructDecompositionT.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index 3fa27608ead948..3feb4bd11c998f 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -371,9 +371,8 @@ ConstructDecompositionT::addClauseSymsToMap(U &&item, // anything and return false, otherwise return true. template bool ConstructDecompositionT::applyToUnique(const ClauseTy *node) { - auto unique = detail::find_unique(leafs, [=](const auto &dirInfo) { - return llvm::omp::isAllowedClauseForDirective(dirInfo.id, node->id, - version); + auto unique = detail::find_unique(leafs, [=](const auto &leaf) { + return llvm::omp::isAllowedClauseForDirective(leaf.id, node->id, version); }); if (unique != leafs.end()) { @@ -438,8 +437,8 @@ bool ConstructDecompositionT::applyToAll(const ClauseTy *node) { } template -template -bool ConstructDecompositionT::applyClause(Clause &&clause, +template +bool ConstructDecompositionT::applyClause(Specific &&specific, const ClauseTy *node) { // The default behavior is to find the unique directive to which the // given clause may be applied. If there are no such directives, or From 51dd4eaaa29683c16151f5168e7f8645acbd6e6c Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Tue, 28 May 2024 11:49:07 -0400 Subject: [PATCH 23/89] Reapply [lldb][DWARF] Delay struct/class/union definition DIE searching when parsing declaration DIEs. (#92328) This reapplies https://github.com/llvm/llvm-project/commit/9a7262c2601874e5aa64c5db19746770212d4b44 (#90663) and added https://github.com/llvm/llvm-project/pull/91808 as a fix. It was causing tests on macos to fail because `SymbolFileDWARF::GetForwardDeclCompilerTypeToDIE` returned the map owned by this symol file. 
When there were two symbol files, two different maps were created for caching from compiler type to DIE even if they are for the same module. The solution is to do the same as `SymbolFileDWARF::GetUniqueDWARFASTTypeMap`: inquery SymbolFileDWARFDebugMap first to get the shared underlying SymbolFile so the map is shared among multiple SymbolFileDWARF. --- .../Plugins/SymbolFile/DWARF/DWARFASTParser.h | 2 + .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 397 ++++++++++-------- .../SymbolFile/DWARF/DWARFASTParserClang.h | 197 ++++----- .../SymbolFile/DWARF/DebugNamesDWARFIndex.cpp | 4 + .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 51 ++- .../SymbolFile/DWARF/SymbolFileDWARF.h | 15 +- .../DWARF/SymbolFileDWARFDebugMap.h | 9 + .../SymbolFile/DWARF/SymbolFileDWARFDwo.cpp | 2 +- .../SymbolFile/DWARF/SymbolFileDWARFDwo.h | 3 +- .../SymbolFile/DWARF/UniqueDWARFASTType.cpp | 107 ++--- .../SymbolFile/DWARF/UniqueDWARFASTType.h | 36 +- .../delayed-definition-die-searching.test | 36 ++ 12 files changed, 467 insertions(+), 392 deletions(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h index 66db396279e063..e144cf0f9bd94e 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h @@ -60,6 +60,8 @@ class DWARFASTParser { virtual ConstString GetDIEClassTemplateParams(const DWARFDIE &die) = 0; + virtual lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) = 0; + static std::optional ParseChildArrayInfo(const DWARFDIE &parent_die, const ExecutionContext *exe_ctx = nullptr); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index f8101aba5c6277..e0b1b430b266f3 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ 
b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -154,6 +154,26 @@ static bool TagIsRecordType(dw_tag_t tag) { } } +static bool IsForwardDeclaration(const DWARFDIE &die, + const ParsedDWARFTypeAttributes &attrs, + LanguageType cu_language) { + if (attrs.is_forward_declaration) + return true; + + // Work around an issue with clang at the moment where forward + // declarations for objective C classes are emitted as: + // DW_TAG_structure_type [2] + // DW_AT_name( "ForwardObjcClass" ) + // DW_AT_byte_size( 0x00 ) + // DW_AT_decl_file( "..." ) + // DW_AT_decl_line( 1 ) + // + // Note that there is no DW_AT_declaration and there are no children, + // and the byte size is zero. + return attrs.byte_size && *attrs.byte_size == 0 && attrs.name && + !die.HasChildren() && cu_language == eLanguageTypeObjC; +} + TypeSP DWARFASTParserClang::ParseTypeFromClangModule(const SymbolContext &sc, const DWARFDIE &die, Log *log) { @@ -249,11 +269,9 @@ static void ForcefullyCompleteType(CompilerType type) { /// This function serves a similar purpose as RequireCompleteType above, but it /// avoids completing the type if it is not immediately necessary. It only /// ensures we _can_ complete the type later. -static void PrepareContextToReceiveMembers(TypeSystemClang &ast, - ClangASTImporter &ast_importer, - clang::DeclContext *decl_ctx, - DWARFDIE die, - const char *type_name_cstr) { +void DWARFASTParserClang::PrepareContextToReceiveMembers( + clang::DeclContext *decl_ctx, const DWARFDIE &decl_ctx_die, + const DWARFDIE &die, const char *type_name_cstr) { auto *tag_decl_ctx = clang::dyn_cast(decl_ctx); if (!tag_decl_ctx) return; // Non-tag context are always ready. @@ -268,7 +286,8 @@ static void PrepareContextToReceiveMembers(TypeSystemClang &ast, // gmodules case), we can complete the type by doing a full import. // If this type was not imported from an external AST, there's nothing to do. 
- CompilerType type = ast.GetTypeForDecl(tag_decl_ctx); + CompilerType type = m_ast.GetTypeForDecl(tag_decl_ctx); + ClangASTImporter &ast_importer = GetClangASTImporter(); if (type && ast_importer.CanImport(type)) { auto qual_type = ClangUtil::GetQualType(type); if (ast_importer.RequireCompleteType(qual_type)) @@ -279,6 +298,13 @@ static void PrepareContextToReceiveMembers(TypeSystemClang &ast, type_name_cstr ? type_name_cstr : "", die.GetOffset()); } + // By searching for the definition DIE of the decl_ctx type, we will either: + // 1. Found the the definition DIE and start its definition with + // TypeSystemClang::StartTagDeclarationDefinition. + // 2. Unable to find it, then need to forcefully complete it. + FindDefinitionTypeForDIE(decl_ctx_die); + if (tag_decl_ctx->isCompleteDefinition() || tag_decl_ctx->isBeingDefined()) + return; // We don't have a type definition and/or the import failed. We must // forcefully complete the type to avoid crashes. ForcefullyCompleteType(type); @@ -620,10 +646,11 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc, if (tag == DW_TAG_typedef) { // DeclContext will be populated when the clang type is materialized in // Type::ResolveCompilerType. 
- PrepareContextToReceiveMembers( - m_ast, GetClangASTImporter(), - GetClangDeclContextContainingDIE(die, nullptr), die, - attrs.name.GetCString()); + DWARFDIE decl_ctx_die; + clang::DeclContext *decl_ctx = + GetClangDeclContextContainingDIE(die, &decl_ctx_die); + PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die, + attrs.name.GetCString()); if (attrs.type.IsValid()) { // Try to parse a typedef from the (DWARF embedded in the) Clang @@ -1103,32 +1130,6 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, // struct and see if this is actually a C++ method Type *class_type = dwarf->ResolveType(decl_ctx_die); if (class_type) { - if (class_type->GetID() != decl_ctx_die.GetID() || - IsClangModuleFwdDecl(decl_ctx_die)) { - - // We uniqued the parent class of this function to another - // class so we now need to associate all dies under - // "decl_ctx_die" to DIEs in the DIE for "class_type"... - DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID()); - - if (class_type_die) { - std::vector failures; - - CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die, - class_type, failures); - - // FIXME do something with these failures that's - // smarter than just dropping them on the ground. - // Unfortunately classes don't like having stuff added - // to them after their definitions are complete... 
- - Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()]; - if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) { - return type_ptr->shared_from_this(); - } - } - } - if (attrs.specification.IsValid()) { // We have a specification which we are going to base our // function prototype off of, so we need this type to be @@ -1263,6 +1264,39 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, } } } + // By here, we should have already completed the c++ class_type + // because if either specification or abstract_origin is present, we + // call GetClangDeclContextForDIE to resolve the DW_TAG_subprogram + // refered by this one until we reached the DW_TAG_subprogram without + // specification or abstract_origin (the else branch above). Then the + // above GetFullCompilerType() will complete the class_type if it's + // not completed yet. After that, we will have the mapping from DIEs + // in class_type_die to DeclContexts in m_die_to_decl_ctx. + if (class_type->GetID() != decl_ctx_die.GetID() || + IsClangModuleFwdDecl(decl_ctx_die)) { + + // We uniqued the parent class of this function to another + // class so we now need to associate all dies under + // "decl_ctx_die" to DIEs in the DIE for "class_type"... + DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID()); + + if (class_type_die) { + std::vector failures; + + CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die, + class_type, failures); + + // FIXME do something with these failures that's + // smarter than just dropping them on the ground. + // Unfortunately classes don't like having stuff added + // to them after their definitions are complete... 
+ + Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()]; + if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) { + return type_ptr->shared_from_this(); + } + } + } } } } @@ -1635,6 +1669,93 @@ DWARFASTParserClang::GetCPlusPlusQualifiedName(const DWARFDIE &die) { return qualified_name; } +lldb_private::Type * +DWARFASTParserClang::FindDefinitionTypeForDIE(const DWARFDIE &die) { + SymbolFileDWARF *dwarf = die.GetDWARF(); + ParsedDWARFTypeAttributes attrs(die); + bool is_forward_declaration = IsForwardDeclaration( + die, attrs, SymbolFileDWARF::GetLanguage(*die.GetCU())); + if (!is_forward_declaration) + return dwarf->GetDIEToType()[die.GetDIE()]; + + const dw_tag_t tag = die.Tag(); + TypeSP type_sp; + Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups); + if (log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a " + "forward declaration DIE, trying to find definition DIE", + static_cast(this), die.GetOffset(), DW_TAG_value_to_name(tag), + attrs.name.GetCString()); + } + // We haven't parse definition die for this type, starting to search for it. + // After we found the definition die, the GetDeclarationDIEToDefinitionDIE() + // map will have the new mapping from this declaration die to definition die. 
+ if (attrs.class_language == eLanguageTypeObjC || + attrs.class_language == eLanguageTypeObjC_plus_plus) { + if (!attrs.is_complete_objc_class && + die.Supports_DW_AT_APPLE_objc_complete_type()) { + // We have a valid eSymbolTypeObjCClass class symbol whose name + // matches the current objective C class that we are trying to find + // and this DIE isn't the complete definition (we checked + // is_complete_objc_class above and know it is false), so the real + // definition is in here somewhere + type_sp = + dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true); + + if (!type_sp) { + SymbolFileDWARFDebugMap *debug_map_symfile = + dwarf->GetDebugMapSymfile(); + if (debug_map_symfile) { + // We weren't able to find a full declaration in this DWARF, + // see if we have a declaration anywhere else... + type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE( + die, attrs.name, true); + } + } + + if (type_sp && log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an " + "incomplete objc type, complete type is {5:x8}", + static_cast(this), die.GetOffset(), + DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), + type_sp->GetID()); + } + } + } + + type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die); + if (!type_sp) { + SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile(); + if (debug_map_symfile) { + // We weren't able to find a full declaration in this DWARF, see + // if we have a declaration anywhere else... 
+ type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die); + } + if (type_sp && log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a " + "forward declaration, complete type is {4:x8}", + static_cast(this), die.GetOffset(), DW_TAG_value_to_name(tag), + attrs.name.GetCString(), type_sp->GetID()); + } + } + + if (!type_sp && log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a " + "forward declaration, unable to find definition DIE for it", + static_cast(this), die.GetOffset(), DW_TAG_value_to_name(tag), + attrs.name.GetCString()); + } + return type_sp.get(); +} + TypeSP DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, const DWARFDIE &die, @@ -1646,14 +1767,10 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, LanguageType cu_language = SymbolFileDWARF::GetLanguage(*die.GetCU()); Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups); - // UniqueDWARFASTType is large, so don't create a local variables on the - // stack, put it on the heap. This function is often called recursively and - // clang isn't good at sharing the stack space for variables in different - // blocks. 
- auto unique_ast_entry_up = std::make_unique(); - ConstString unique_typename(attrs.name); Declaration unique_decl(attrs.decl); + uint64_t byte_size = attrs.byte_size.value_or(0); + attrs.is_forward_declaration = IsForwardDeclaration(die, attrs, cu_language); if (attrs.name) { if (Language::LanguageIsCPlusPlus(cu_language)) { @@ -1666,14 +1783,42 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, unique_decl.Clear(); } - if (dwarf->GetUniqueDWARFASTTypeMap().Find( - unique_typename, die, unique_decl, attrs.byte_size.value_or(-1), - *unique_ast_entry_up)) { - type_sp = unique_ast_entry_up->m_type_sp; + if (UniqueDWARFASTType *unique_ast_entry_type = + dwarf->GetUniqueDWARFASTTypeMap().Find( + unique_typename, die, unique_decl, byte_size, + attrs.is_forward_declaration)) { + type_sp = unique_ast_entry_type->m_type_sp; if (type_sp) { dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); LinkDeclContextToDIE( - GetCachedClangDeclContextForDIE(unique_ast_entry_up->m_die), die); + GetCachedClangDeclContextForDIE(unique_ast_entry_type->m_die), die); + if (!attrs.is_forward_declaration) { + // If the DIE being parsed in this function is a definition and the + // entry in the map is a declaration, then we need to update the entry + // to point to the definition DIE. + if (unique_ast_entry_type->m_is_forward_declaration) { + unique_ast_entry_type->m_die = die; + unique_ast_entry_type->m_byte_size = byte_size; + unique_ast_entry_type->m_declaration = unique_decl; + unique_ast_entry_type->m_is_forward_declaration = false; + // Need to update Type ID to refer to the definition DIE. because + // it's used in ParseSubroutine to determine if we need to copy cxx + // method types from a declaration DIE to this definition DIE. 
+ type_sp->SetID(die.GetID()); + clang_type = type_sp->GetForwardCompilerType(); + if (attrs.class_language != eLanguageTypeObjC && + attrs.class_language != eLanguageTypeObjC_plus_plus) + TypeSystemClang::StartTagDeclarationDefinition(clang_type); + + CompilerType compiler_type_no_qualifiers = + ClangUtil::RemoveFastQualifiers(clang_type); + auto result = dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace( + compiler_type_no_qualifiers.GetOpaqueQualType(), + *die.GetDIERef()); + if (!result.second) + result.first->second = *die.GetDIERef(); + } + } return type_sp; } } @@ -1695,125 +1840,21 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, default_accessibility = eAccessPrivate; } - if (attrs.byte_size && *attrs.byte_size == 0 && attrs.name && - !die.HasChildren() && cu_language == eLanguageTypeObjC) { - // Work around an issue with clang at the moment where forward - // declarations for objective C classes are emitted as: - // DW_TAG_structure_type [2] - // DW_AT_name( "ForwardObjcClass" ) - // DW_AT_byte_size( 0x00 ) - // DW_AT_decl_file( "..." ) - // DW_AT_decl_line( 1 ) - // - // Note that there is no DW_AT_declaration and there are no children, - // and the byte size is zero. 
- attrs.is_forward_declaration = true; - } - - if (attrs.class_language == eLanguageTypeObjC || - attrs.class_language == eLanguageTypeObjC_plus_plus) { - if (!attrs.is_complete_objc_class && - die.Supports_DW_AT_APPLE_objc_complete_type()) { - // We have a valid eSymbolTypeObjCClass class symbol whose name - // matches the current objective C class that we are trying to find - // and this DIE isn't the complete definition (we checked - // is_complete_objc_class above and know it is false), so the real - // definition is in here somewhere - type_sp = - dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true); - - if (!type_sp) { - SymbolFileDWARFDebugMap *debug_map_symfile = - dwarf->GetDebugMapSymfile(); - if (debug_map_symfile) { - // We weren't able to find a full declaration in this DWARF, - // see if we have a declaration anywhere else... - type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE( - die, attrs.name, true); - } - } - - if (type_sp) { - if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an " - "incomplete objc type, complete type is {5:x8}", - static_cast(this), die.GetOffset(), - DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), - type_sp->GetID()); - } - - // We found a real definition for this type elsewhere so lets use - // it and cache the fact that we found a complete type for this - // die - dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); - return type_sp; - } - } - } - if (attrs.is_forward_declaration) { - // We have a forward declaration to a type and we need to try and - // find a full declaration. We look in the current type index just in - // case we have a forward declaration followed by an actual - // declarations in the DWARF. If this fails, we need to look - // elsewhere... 
- if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a " - "forward declaration, trying to find complete type", - static_cast(this), die.GetOffset(), DW_TAG_value_to_name(tag), - tag, attrs.name.GetCString()); - } - // See if the type comes from a Clang module and if so, track down // that type. type_sp = ParseTypeFromClangModule(sc, die, log); if (type_sp) return type_sp; - - // type_sp = FindDefinitionTypeForDIE (dwarf_cu, die, - // type_name_const_str); - type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die); - - if (!type_sp) { - SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile(); - if (debug_map_symfile) { - // We weren't able to find a full declaration in this DWARF, see - // if we have a declaration anywhere else... - type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die); - } - } - - if (type_sp) { - if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a " - "forward declaration, complete type is {5:x8}", - static_cast(this), die.GetOffset(), - DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), - type_sp->GetID()); - } - - // We found a real definition for this type elsewhere so lets use - // it and cache the fact that we found a complete type for this die - dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); - clang::DeclContext *defn_decl_ctx = - GetCachedClangDeclContextForDIE(dwarf->GetDIE(type_sp->GetID())); - if (defn_decl_ctx) - LinkDeclContextToDIE(defn_decl_ctx, die); - return type_sp; - } } + assert(tag_decl_kind != -1); UNUSED_IF_ASSERT_DISABLED(tag_decl_kind); - bool clang_type_was_created = false; - clang::DeclContext *decl_ctx = GetClangDeclContextContainingDIE(die, nullptr); + DWARFDIE decl_ctx_die; + clang::DeclContext *decl_ctx = + GetClangDeclContextContainingDIE(die, &decl_ctx_die); - PrepareContextToReceiveMembers(m_ast, 
GetClangASTImporter(), decl_ctx, die, + PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die, attrs.name.GetCString()); if (attrs.accessibility == eAccessNone && decl_ctx) { @@ -1852,20 +1893,17 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, tag_decl_kind, template_param_infos); clang_type = m_ast.CreateClassTemplateSpecializationType(class_specialization_decl); - clang_type_was_created = true; m_ast.SetMetadata(class_template_decl, metadata); m_ast.SetMetadata(class_specialization_decl, metadata); } - if (!clang_type_was_created) { - clang_type_was_created = true; + if (!clang_type) { clang_type = m_ast.CreateRecordType( decl_ctx, GetOwningClangModule(die), attrs.accessibility, attrs.name.GetCString(), tag_decl_kind, attrs.class_language, &metadata, attrs.exports_symbols); } - // Store a forward declaration to this class type in case any // parameters in any class methods need it for the clang types for // function prototypes. @@ -1876,13 +1914,19 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, Type::ResolveState::Forward, TypePayloadClang(OptionalClangModuleID(), attrs.is_complete_objc_class)); + // UniqueDWARFASTType is large, so don't create a local variables on the + // stack, put it on the heap. This function is often called recursively and + // clang isn't good at sharing the stack space for variables in different + // blocks. 
+ auto unique_ast_entry_up = std::make_unique(); // Add our type to the unique type map so we don't end up creating many // copies of the same type over and over in the ASTContext for our // module unique_ast_entry_up->m_type_sp = type_sp; unique_ast_entry_up->m_die = die; unique_ast_entry_up->m_declaration = unique_decl; - unique_ast_entry_up->m_byte_size = attrs.byte_size.value_or(0); + unique_ast_entry_up->m_byte_size = byte_size; + unique_ast_entry_up->m_is_forward_declaration = attrs.is_forward_declaration; dwarf->GetUniqueDWARFASTTypeMap().Insert(unique_typename, *unique_ast_entry_up); @@ -1923,7 +1967,7 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, GetClangASTImporter().SetRecordLayout(record_decl, layout); } } - } else if (clang_type_was_created) { + } else { // Start the definition if the class is not objective C since the // underlying decls respond to isCompleteDefinition(). Objective // C decls don't respond to isCompleteDefinition() so we can't @@ -1935,26 +1979,21 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, if (attrs.class_language != eLanguageTypeObjC && attrs.class_language != eLanguageTypeObjC_plus_plus) TypeSystemClang::StartTagDeclarationDefinition(clang_type); - - // Leave this as a forward declaration until we need to know the - // details of the type. lldb_private::Type will automatically call - // the SymbolFile virtual function - // "SymbolFileDWARF::CompleteType(Type *)" When the definition - // needs to be defined. - assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count( - ClangUtil::RemoveFastQualifiers(clang_type) - .GetOpaqueQualType()) && - "Type already in the forward declaration map!"); - // Can't assume m_ast.GetSymbolFile() is actually a - // SymbolFileDWARF, it can be a SymbolFileDWARFDebugMap for Apple - // binaries. 
- dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace( - ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(), - *die.GetDIERef()); - m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true); } } + // If this is a declaration DIE, leave this as a forward declaration until we + // need to know the details of the type. lldb_private::Type will automatically + // call the SymbolFile virtual function "SymbolFileDWARF::CompleteType(Type + // *)" When the definition needs to be defined. + assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count( + ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType()) && + "Type already in the forward declaration map!"); + dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace( + ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(), + *die.GetDIERef()); + m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true); + // If we made a clang type, set the trivial abi if applicable: We only // do this for pass by value - which implies the Trivial ABI. There // isn't a way to assert that something that would normally be pass by diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index 8d4af203bb2871..853b8ccc30369f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -42,40 +42,40 @@ struct ParsedDWARFTypeAttributes; class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { public: + typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE; + DWARFASTParserClang(lldb_private::TypeSystemClang &ast); ~DWARFASTParserClang() override; // DWARFASTParser interface. 
- lldb::TypeSP - ParseTypeFromDWARF(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - bool *type_is_new_ptr) override; + lldb::TypeSP ParseTypeFromDWARF(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, + bool *type_is_new_ptr) override; - lldb_private::ConstString ConstructDemangledNameFromDWARF( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::ConstString + ConstructDemangledNameFromDWARF(const DWARFDIE &die) override; lldb_private::Function * ParseFunctionFromDWARF(lldb_private::CompileUnit &comp_unit, - const lldb_private::plugin::dwarf::DWARFDIE &die, + const DWARFDIE &die, const lldb_private::AddressRange &func_range) override; bool - CompleteTypeFromDWARF(const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::Type *type, + CompleteTypeFromDWARF(const DWARFDIE &die, lldb_private::Type *type, lldb_private::CompilerType &compiler_type) override; - lldb_private::CompilerDecl GetDeclForUIDFromDWARF( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::CompilerDecl + GetDeclForUIDFromDWARF(const DWARFDIE &die) override; void EnsureAllDIEsInDeclContextHaveBeenParsed( lldb_private::CompilerDeclContext decl_context) override; - lldb_private::CompilerDeclContext GetDeclContextForUIDFromDWARF( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::CompilerDeclContext + GetDeclContextForUIDFromDWARF(const DWARFDIE &die) override; - lldb_private::CompilerDeclContext GetDeclContextContainingUIDFromDWARF( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::CompilerDeclContext + GetDeclContextContainingUIDFromDWARF(const DWARFDIE &die) override; lldb_private::ClangASTImporter &GetClangASTImporter(); @@ -105,8 +105,13 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \return A string, including surrounding '<>', of the template parameters. 
/// If the DIE's name already has '<>', returns an empty ConstString because /// it's assumed that the caller is using the DIE name anyway. - lldb_private::ConstString GetDIEClassTemplateParams( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::ConstString + GetDIEClassTemplateParams(const DWARFDIE &die) override; + + // Searching for definition DIE for the given DIE and return the type + // associated with the definition DIE, or nullptr if definition DIE is not + // found. + lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) override; protected: /// Protected typedefs and members. @@ -118,8 +123,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *, clang::DeclContext *> DIEToDeclContextMap; - typedef std::multimap + typedef std::multimap DeclContextToDIEMap; typedef llvm::DenseMap< const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *, @@ -137,14 +141,11 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { std::unique_ptr m_clang_ast_importer_up; /// @} - clang::DeclContext * - GetDeclContextForBlock(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::DeclContext *GetDeclContextForBlock(const DWARFDIE &die); - clang::BlockDecl * - ResolveBlockDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::BlockDecl *ResolveBlockDIE(const DWARFDIE &die); - clang::NamespaceDecl * - ResolveNamespaceDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::NamespaceDecl *ResolveNamespaceDIE(const DWARFDIE &die); /// Returns the namespace decl that a DW_TAG_imported_declaration imports. /// @@ -155,96 +156,86 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// 'die' imports. If the imported entity is not a namespace /// or another import declaration, returns nullptr. If an error /// occurs, returns nullptr. 
- clang::NamespaceDecl *ResolveImportedDeclarationDIE( - const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::NamespaceDecl *ResolveImportedDeclarationDIE(const DWARFDIE &die); - bool ParseTemplateDIE(const lldb_private::plugin::dwarf::DWARFDIE &die, + bool ParseTemplateDIE(const DWARFDIE &die, lldb_private::TypeSystemClang::TemplateParameterInfos &template_param_infos); bool ParseTemplateParameterInfos( - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + const DWARFDIE &parent_die, lldb_private::TypeSystemClang::TemplateParameterInfos &template_param_infos); - std::string - GetCPlusPlusQualifiedName(const lldb_private::plugin::dwarf::DWARFDIE &die); + std::string GetCPlusPlusQualifiedName(const DWARFDIE &die); bool ParseChildMembers( - const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::CompilerType &class_compiler_type, + const DWARFDIE &die, lldb_private::CompilerType &class_compiler_type, std::vector> &base_classes, - std::vector &member_function_dies, - std::vector &contained_type_dies, + std::vector &member_function_dies, + std::vector &contained_type_dies, DelayedPropertyList &delayed_properties, const lldb::AccessType default_accessibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info); size_t ParseChildParameters(clang::DeclContext *containing_decl_ctx, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - bool skip_artificial, bool &is_static, bool &is_variadic, + const DWARFDIE &parent_die, bool skip_artificial, + bool &is_static, bool &is_variadic, bool &has_template_params, std::vector &function_args, std::vector &function_param_decls, unsigned &type_quals); - size_t ParseChildEnumerators( - lldb_private::CompilerType &compiler_type, bool is_signed, - uint32_t enumerator_byte_size, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die); + size_t ParseChildEnumerators(lldb_private::CompilerType &compiler_type, + bool is_signed, uint32_t enumerator_byte_size, + const DWARFDIE &parent_die); /// Parse a 
structure, class, or union type DIE. - lldb::TypeSP - ParseStructureLikeDIE(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP ParseStructureLikeDIE(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs); - clang::Decl * - GetClangDeclForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::Decl *GetClangDeclForDIE(const DWARFDIE &die); - clang::DeclContext * - GetClangDeclContextForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::DeclContext *GetClangDeclContextForDIE(const DWARFDIE &die); - clang::DeclContext *GetClangDeclContextContainingDIE( - const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::plugin::dwarf::DWARFDIE *decl_ctx_die); - lldb_private::OptionalClangModuleID - GetOwningClangModule(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::DeclContext *GetClangDeclContextContainingDIE(const DWARFDIE &die, + DWARFDIE *decl_ctx_die); + lldb_private::OptionalClangModuleID GetOwningClangModule(const DWARFDIE &die); - bool CopyUniqueClassMethodTypes( - const lldb_private::plugin::dwarf::DWARFDIE &src_class_die, - const lldb_private::plugin::dwarf::DWARFDIE &dst_class_die, - lldb_private::Type *class_type, - std::vector &failures); + bool CopyUniqueClassMethodTypes(const DWARFDIE &src_class_die, + const DWARFDIE &dst_class_die, + lldb_private::Type *class_type, + std::vector &failures); - clang::DeclContext *GetCachedClangDeclContextForDIE( - const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::DeclContext *GetCachedClangDeclContextForDIE(const DWARFDIE &die); - void LinkDeclContextToDIE(clang::DeclContext *decl_ctx, - const lldb_private::plugin::dwarf::DWARFDIE &die); + void LinkDeclContextToDIE(clang::DeclContext *decl_ctx, const DWARFDIE &die); - void LinkDeclToDIE(clang::Decl *decl, - const lldb_private::plugin::dwarf::DWARFDIE &die); + void LinkDeclToDIE(clang::Decl 
*decl, const DWARFDIE &die); /// If \p type_sp is valid, calculate and set its symbol context scope, and /// update the type list for its backing symbol file. /// /// Returns \p type_sp. - lldb::TypeSP UpdateSymbolContextScopeForType( - const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, lldb::TypeSP type_sp); + lldb::TypeSP + UpdateSymbolContextScopeForType(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, lldb::TypeSP type_sp); /// Follow Clang Module Skeleton CU references to find a type definition. - lldb::TypeSP - ParseTypeFromClangModule(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::Log *log); + lldb::TypeSP ParseTypeFromClangModule(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, + lldb_private::Log *log); // Return true if this type is a declaration to a type in an external // module. - lldb::ModuleSP - GetModuleForType(const lldb_private::plugin::dwarf::DWARFDIE &die); + lldb::ModuleSP GetModuleForType(const DWARFDIE &die); + + void PrepareContextToReceiveMembers(clang::DeclContext *decl_ctx, + const DWARFDIE &decl_ctx_die, + const DWARFDIE &die, + const char *type_name_cstr); static bool classof(const DWARFASTParser *Parser) { return Parser->GetKind() == Kind::DWARFASTParserClang; @@ -274,10 +265,8 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// Parsed form of all attributes that are relevant for parsing type members. struct MemberAttributes { - explicit MemberAttributes( - const lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - lldb::ModuleSP module_sp); + explicit MemberAttributes(const DWARFDIE &die, const DWARFDIE &parent_die, + lldb::ModuleSP module_sp); const char *name = nullptr; /// Indicates how many bits into the word (according to the host endianness) /// the low-order bit of the field starts. Can be negative. 
@@ -324,15 +313,12 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// created property. /// \param delayed_properties The list of delayed properties that the result /// will be appended to. - void - ParseObjCProperty(const lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - const lldb_private::CompilerType &class_clang_type, - DelayedPropertyList &delayed_properties); + void ParseObjCProperty(const DWARFDIE &die, const DWARFDIE &parent_die, + const lldb_private::CompilerType &class_clang_type, + DelayedPropertyList &delayed_properties); void - ParseSingleMember(const lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + ParseSingleMember(const DWARFDIE &die, const DWARFDIE &parent_die, const lldb_private::CompilerType &class_clang_type, lldb::AccessType default_accessibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info, @@ -350,31 +336,25 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \param[in] class_clang_type The parent RecordType of the static /// member this function will create. 
void CreateStaticMemberVariable( - const lldb_private::plugin::dwarf::DWARFDIE &die, - const MemberAttributes &attrs, + const DWARFDIE &die, const MemberAttributes &attrs, const lldb_private::CompilerType &class_clang_type); - bool CompleteRecordType(const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::Type *type, + bool CompleteRecordType(const DWARFDIE &die, lldb_private::Type *type, lldb_private::CompilerType &clang_type); - bool CompleteEnumType(const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::Type *type, + bool CompleteEnumType(const DWARFDIE &die, lldb_private::Type *type, lldb_private::CompilerType &clang_type); - lldb::TypeSP - ParseTypeModifier(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP ParseTypeModifier(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs); lldb::TypeSP ParseEnum(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - ParsedDWARFTypeAttributes &attrs); - lldb::TypeSP ParseSubroutine(const lldb_private::plugin::dwarf::DWARFDIE &die, + const DWARFDIE &die, ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP ParseSubroutine(const DWARFDIE &die, const ParsedDWARFTypeAttributes &attrs); - lldb::TypeSP ParseArrayType(const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb::TypeSP ParseArrayType(const DWARFDIE &die, const ParsedDWARFTypeAttributes &attrs); - lldb::TypeSP - ParsePointerToMemberType(const lldb_private::plugin::dwarf::DWARFDIE &die, - const ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP ParsePointerToMemberType(const DWARFDIE &die, + const ParsedDWARFTypeAttributes &attrs); /// Parses a DW_TAG_inheritance DIE into a base/super class. 
/// @@ -391,8 +371,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \param layout_info The layout information that will be updated for C++ /// base classes with the base offset. void ParseInheritance( - const lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + const DWARFDIE &die, const DWARFDIE &parent_die, const lldb_private::CompilerType class_clang_type, const lldb::AccessType default_accessibility, const lldb::ModuleSP &module_sp, @@ -409,8 +388,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \param layout_info The layout information that will be updated for // base classes with the base offset void - ParseRustVariantPart(lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + ParseRustVariantPart(DWARFDIE &die, const DWARFDIE &parent_die, lldb_private::CompilerType &class_clang_type, const lldb::AccessType default_accesibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info); @@ -420,8 +398,9 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// Some attributes are relevant for all kinds of types (declaration), while /// others are only meaningful to a specific type (is_virtual) struct ParsedDWARFTypeAttributes { - explicit ParsedDWARFTypeAttributes( - const lldb_private::plugin::dwarf::DWARFDIE &die); + typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE; + + explicit ParsedDWARFTypeAttributes(const DWARFDIE &die); lldb::AccessType accessibility = lldb::eAccessNone; bool is_artificial = false; @@ -438,7 +417,7 @@ struct ParsedDWARFTypeAttributes { const char *mangled_name = nullptr; lldb_private::ConstString name; lldb_private::Declaration decl; - lldb_private::plugin::dwarf::DWARFDIE object_pointer; + DWARFDIE object_pointer; lldb_private::plugin::dwarf::DWARFFormValue abstract_origin; lldb_private::plugin::dwarf::DWARFFormValue 
containing_type; lldb_private::plugin::dwarf::DWARFFormValue signature; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp index 79400e36e04f3f..c98e5481609dea 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp @@ -87,6 +87,10 @@ bool DebugNamesDWARFIndex::ProcessEntry( DWARFDIE die = dwarf.GetDIE(*ref); if (!die) return true; + // Clang erroneously emits index entries for declaration DIEs in case when the + // definition is in a type unit (llvm.org/pr77696). Weed those out. + if (die.GetAttributeValueAsUnsigned(DW_AT_declaration, 0)) + return true; return callback(die); } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index f6f152726bf74e..bc489e5b8ad465 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -481,6 +481,13 @@ static ConstString GetDWARFMachOSegmentName() { return g_dwarf_section_name; } +llvm::DenseMap & +SymbolFileDWARF::GetForwardDeclCompilerTypeToDIE() { + if (SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile()) + return debug_map_symfile->GetForwardDeclCompilerTypeToDIE(); + return m_forward_decl_compiler_type_to_die; +} + UniqueDWARFASTTypeMap &SymbolFileDWARF::GetUniqueDWARFASTTypeMap() { SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile(); if (debug_map_symfile) @@ -1632,27 +1639,33 @@ bool SymbolFileDWARF::CompleteType(CompilerType &compiler_type) { return true; } - DWARFDIE dwarf_die = GetDIE(die_it->getSecond()); - if (dwarf_die) { - // Once we start resolving this type, remove it from the forward - // declaration map in case anyone child members or other types require this - // type to get resolved. 
The type will get resolved when all of the calls - // to SymbolFileDWARF::ResolveClangOpaqueTypeDefinition are done. - GetForwardDeclCompilerTypeToDIE().erase(die_it); - - Type *type = GetDIEToType().lookup(dwarf_die.GetDIE()); + // Once we start resolving this type, remove it from the forward + // declaration map in case anyone's child members or other types require this + // type to get resolved. + DWARFDIE dwarf_die = GetDIE(die_it->second); + GetForwardDeclCompilerTypeToDIE().erase(die_it); + Type *type = nullptr; + if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) + type = dwarf_ast->FindDefinitionTypeForDIE(dwarf_die); + if (!type) + return false; - Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion); - if (log) - GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( - log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...", - dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()), - dwarf_die.Tag(), type->GetName().AsCString()); - assert(compiler_type); - if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) - return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type); + die_it = GetForwardDeclCompilerTypeToDIE().find( + compiler_type_no_qualifiers.GetOpaqueQualType()); + if (die_it != GetForwardDeclCompilerTypeToDIE().end()) { + dwarf_die = GetDIE(die_it->getSecond()); + GetForwardDeclCompilerTypeToDIE().erase(die_it); } - return false; + + if (Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion)) + GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( + log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...", + dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()), + dwarf_die.Tag(), type->GetName().AsCString()); + assert(compiler_type); + if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) + return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type); + return true; } Type *SymbolFileDWARF::ResolveType(const DWARFDIE &die, diff 
--git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 7282c08c6857c9..35893f2072dd64 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -335,12 +335,8 @@ class SymbolFileDWARF : public SymbolFileCommon { virtual DIEToTypePtr &GetDIEToType() { return m_die_to_type; } - typedef llvm::DenseMap - CompilerTypeToDIE; - - virtual CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() { - return m_forward_decl_compiler_type_to_die; - } + virtual llvm::DenseMap & + GetForwardDeclCompilerTypeToDIE(); typedef llvm::DenseMap DIEToVariableSP; @@ -533,9 +529,14 @@ class SymbolFileDWARF : public SymbolFileCommon { NameToOffsetMap m_function_scope_qualified_name_map; std::unique_ptr m_ranges; UniqueDWARFASTTypeMap m_unique_ast_type_map; + // A map from DIE to lldb_private::Type. For record type, the key might be + // either declaration DIE or definition DIE. DIEToTypePtr m_die_to_type; DIEToVariableSP m_die_to_variable_sp; - CompilerTypeToDIE m_forward_decl_compiler_type_to_die; + // A map from CompilerType to the struct/class/union/enum DIE (might be a + // declaration or a definition) that is used to construct it. 
+ llvm::DenseMap + m_forward_decl_compiler_type_to_die; llvm::DenseMap> m_type_unit_support_files; std::vector m_lldb_cu_to_dwarf_unit; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h index de22dd676eef0a..d7d571919bc7d6 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h @@ -284,6 +284,11 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon { lldb::TypeSP FindCompleteObjCDefinitionTypeForDIE( const DWARFDIE &die, ConstString type_name, bool must_be_implementation); + llvm::DenseMap & + GetForwardDeclCompilerTypeToDIE() { + return m_forward_decl_compiler_type_to_die; + } + UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() { return m_unique_ast_type_map; } @@ -321,6 +326,10 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon { std::vector m_func_indexes; // Sorted by address std::vector m_glob_indexes; std::map>, OSOInfoSP> m_oso_map; + // A map from CompilerType to the struct/class/union/enum DIE (might be a + // declaration or a definition) that is used to construct it. 
+ llvm::DenseMap + m_forward_decl_compiler_type_to_die; UniqueDWARFASTTypeMap m_unique_ast_type_map; LazyBool m_supports_DW_AT_APPLE_objc_complete_type; DebugMap m_debug_map; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp index 85e1afd0d89761..8fd369c65f86b6 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp @@ -110,7 +110,7 @@ SymbolFileDWARF::DIEToVariableSP &SymbolFileDWARFDwo::GetDIEToVariable() { return GetBaseSymbolFile().GetDIEToVariable(); } -SymbolFileDWARF::CompilerTypeToDIE & +llvm::DenseMap & SymbolFileDWARFDwo::GetForwardDeclCompilerTypeToDIE() { return GetBaseSymbolFile().GetForwardDeclCompilerTypeToDIE(); } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h index 1500540424b524..2f0ac415e90d40 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h @@ -72,7 +72,8 @@ class SymbolFileDWARFDwo : public SymbolFileDWARF { DIEToVariableSP &GetDIEToVariable() override; - CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() override; + llvm::DenseMap & + GetForwardDeclCompilerTypeToDIE() override; UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() override; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp index 223518f0ae8241..4762356034cab7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp @@ -13,66 +13,67 @@ using namespace lldb_private::dwarf; using namespace lldb_private::plugin::dwarf; -bool UniqueDWARFASTTypeList::Find(const DWARFDIE &die, - const lldb_private::Declaration &decl, - const int32_t byte_size, - UniqueDWARFASTType &entry) const { - for 
(const UniqueDWARFASTType &udt : m_collection) { +UniqueDWARFASTType *UniqueDWARFASTTypeList::Find( + const DWARFDIE &die, const lldb_private::Declaration &decl, + const int32_t byte_size, bool is_forward_declaration) { + for (UniqueDWARFASTType &udt : m_collection) { // Make sure the tags match if (udt.m_die.Tag() == die.Tag()) { - // Validate byte sizes of both types only if both are valid. - if (udt.m_byte_size < 0 || byte_size < 0 || - udt.m_byte_size == byte_size) { - // Make sure the file and line match - if (udt.m_declaration == decl) { - // The type has the same name, and was defined on the same file and - // line. Now verify all of the parent DIEs match. - DWARFDIE parent_arg_die = die.GetParent(); - DWARFDIE parent_pos_die = udt.m_die.GetParent(); - bool match = true; - bool done = false; - while (!done && match && parent_arg_die && parent_pos_die) { - const dw_tag_t parent_arg_tag = parent_arg_die.Tag(); - const dw_tag_t parent_pos_tag = parent_pos_die.Tag(); - if (parent_arg_tag == parent_pos_tag) { - switch (parent_arg_tag) { - case DW_TAG_class_type: - case DW_TAG_structure_type: - case DW_TAG_union_type: - case DW_TAG_namespace: { - const char *parent_arg_die_name = parent_arg_die.GetName(); - if (parent_arg_die_name == - nullptr) // Anonymous (i.e. no-name) struct - { - match = false; - } else { - const char *parent_pos_die_name = parent_pos_die.GetName(); - if (parent_pos_die_name == nullptr || - ((parent_arg_die_name != parent_pos_die_name) && - strcmp(parent_arg_die_name, parent_pos_die_name))) - match = false; - } - } break; - - case DW_TAG_compile_unit: - case DW_TAG_partial_unit: - done = true; - break; - default: - break; - } + // If they are not both definition DIEs or both declaration DIEs, then + // don't check for byte size and declaration location, because declaration + // DIEs usually don't have those info. + bool matching_size_declaration = + udt.m_is_forward_declaration != is_forward_declaration + ? 
true + : (udt.m_byte_size < 0 || byte_size < 0 || + udt.m_byte_size == byte_size) && + udt.m_declaration == decl; + if (!matching_size_declaration) + continue; + // The type has the same name, and was defined on the same file and + // line. Now verify all of the parent DIEs match. + DWARFDIE parent_arg_die = die.GetParent(); + DWARFDIE parent_pos_die = udt.m_die.GetParent(); + bool match = true; + bool done = false; + while (!done && match && parent_arg_die && parent_pos_die) { + const dw_tag_t parent_arg_tag = parent_arg_die.Tag(); + const dw_tag_t parent_pos_tag = parent_pos_die.Tag(); + if (parent_arg_tag == parent_pos_tag) { + switch (parent_arg_tag) { + case DW_TAG_class_type: + case DW_TAG_structure_type: + case DW_TAG_union_type: + case DW_TAG_namespace: { + const char *parent_arg_die_name = parent_arg_die.GetName(); + if (parent_arg_die_name == nullptr) { + // Anonymous (i.e. no-name) struct + match = false; + } else { + const char *parent_pos_die_name = parent_pos_die.GetName(); + if (parent_pos_die_name == nullptr || + ((parent_arg_die_name != parent_pos_die_name) && + strcmp(parent_arg_die_name, parent_pos_die_name))) + match = false; } - parent_arg_die = parent_arg_die.GetParent(); - parent_pos_die = parent_pos_die.GetParent(); - } + } break; - if (match) { - entry = udt; - return true; + case DW_TAG_compile_unit: + case DW_TAG_partial_unit: + done = true; + break; + default: + break; } } + parent_arg_die = parent_arg_die.GetParent(); + parent_pos_die = parent_pos_die.GetParent(); + } + + if (match) { + return &udt; } } } - return false; + return nullptr; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h index bf3cbae55e5c7b..29e5c02dcbe176 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h @@ -23,31 +23,19 @@ class UniqueDWARFASTType { // Constructors and Destructors UniqueDWARFASTType() 
: m_type_sp(), m_die(), m_declaration() {} - UniqueDWARFASTType(lldb::TypeSP &type_sp, const DWARFDIE &die, - const Declaration &decl, int32_t byte_size) - : m_type_sp(type_sp), m_die(die), m_declaration(decl), - m_byte_size(byte_size) {} - UniqueDWARFASTType(const UniqueDWARFASTType &rhs) : m_type_sp(rhs.m_type_sp), m_die(rhs.m_die), - m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size) {} + m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size), + m_is_forward_declaration(rhs.m_is_forward_declaration) {} ~UniqueDWARFASTType() = default; - UniqueDWARFASTType &operator=(const UniqueDWARFASTType &rhs) { - if (this != &rhs) { - m_type_sp = rhs.m_type_sp; - m_die = rhs.m_die; - m_declaration = rhs.m_declaration; - m_byte_size = rhs.m_byte_size; - } - return *this; - } - lldb::TypeSP m_type_sp; DWARFDIE m_die; Declaration m_declaration; int32_t m_byte_size = -1; + // True if the m_die is a forward declaration DIE. + bool m_is_forward_declaration = true; }; class UniqueDWARFASTTypeList { @@ -62,8 +50,9 @@ class UniqueDWARFASTTypeList { m_collection.push_back(entry); } - bool Find(const DWARFDIE &die, const Declaration &decl, - const int32_t byte_size, UniqueDWARFASTType &entry) const; + UniqueDWARFASTType *Find(const DWARFDIE &die, const Declaration &decl, + const int32_t byte_size, + bool is_forward_declaration); protected: typedef std::vector collection; @@ -80,14 +69,15 @@ class UniqueDWARFASTTypeMap { m_collection[name.GetCString()].Append(entry); } - bool Find(ConstString name, const DWARFDIE &die, const Declaration &decl, - const int32_t byte_size, UniqueDWARFASTType &entry) const { + UniqueDWARFASTType *Find(ConstString name, const DWARFDIE &die, + const Declaration &decl, const int32_t byte_size, + bool is_forward_declaration) { const char *unique_name_cstr = name.GetCString(); - collection::const_iterator pos = m_collection.find(unique_name_cstr); + collection::iterator pos = m_collection.find(unique_name_cstr); if (pos != m_collection.end()) 
{ - return pos->second.Find(die, decl, byte_size, entry); + return pos->second.Find(die, decl, byte_size, is_forward_declaration); } - return false; + return nullptr; } protected: diff --git a/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test new file mode 100644 index 00000000000000..d253981b498c81 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test @@ -0,0 +1,36 @@ +# Test definition DIE searching is delayed until complete type is required. + +# UNSUPPORTED: system-windows + +# RUN: split-file %s %t +# RUN: %clangxx_host %t/main.cpp %t/t1_def.cpp -gdwarf -o %t.out +# RUN: %lldb -b %t.out -s %t/lldb.cmd | FileCheck %s + +# CHECK: (lldb) p v1 +# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't2' +# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1' +# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't2' resolving forward declaration... +# CHECK: (t2) {} +# CHECK: (lldb) p v2 +# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1' +# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't1' resolving forward declaration... + +#--- lldb.cmd +log enable dwarf comp +p v1 +p v2 + +#--- main.cpp +template +struct t2 { +}; +struct t1; +t2 v1; // this CU doesn't have definition DIE for t1, but only declaration DIE for it. +int main() { +} + +#--- t1_def.cpp +struct t1 { // this CU contains definition DIE for t1. + int x; +}; +t1 v2; From d490ce22e93db2e9d57985bc50915e383327911f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 08:51:42 -0700 Subject: [PATCH 24/89] [RISCV] Use mask undisturbed policy when silencing sNans for strict rounding ops. (#93356) The elements that aren't sNans need to get passed through this fadd instruction unchanged. 
With the agnostic mask policy they might be forced to all ones. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- .../RISCV/rvv/fceil-constrained-sdnode.ll | 45 ++++++++++++------- .../RISCV/rvv/ffloor-constrained-sdnode.ll | 45 ++++++++++++------- .../fixed-vectors-fceil-constrained-sdnode.ll | 45 ++++++++++++------- ...fixed-vectors-ffloor-constrained-sdnode.ll | 45 ++++++++++++------- ...d-vectors-fnearbyint-constrained-sdnode.ll | 36 ++++++++++----- ...fixed-vectors-fround-constrained-sdnode.ll | 45 ++++++++++++------- ...d-vectors-froundeven-constrained-sdnode.ll | 45 ++++++++++++------- ...fixed-vectors-ftrunc-constrained-sdnode.ll | 45 ++++++++++++------- .../rvv/fnearbyint-constrained-sdnode.ll | 45 ++++++++++++------- .../RISCV/rvv/fround-constrained-sdnode.ll | 45 ++++++++++++------- .../rvv/froundeven-constrained-sdnode.ll | 45 ++++++++++++------- .../RISCV/rvv/ftrunc-constrained-sdnode.ll | 45 ++++++++++++------- 13 files changed, 355 insertions(+), 178 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f0e5a7d393b6c9..c826892c1668ec 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3125,7 +3125,7 @@ lowerVectorStrictFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG, Chain = Unorder.getValue(1); Src = DAG.getNode(RISCVISD::STRICT_FADD_VL, DL, DAG.getVTList(ContainerVT, MVT::Other), - {Chain, Src, Src, DAG.getUNDEF(ContainerVT), Unorder, VL}); + {Chain, Src, Src, Src, Unorder, VL}); Chain = Src.getValue(1); // We do the conversion on the absolute value and fix the sign at the end. 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll index 75747a6674b7b4..d8781495abd75c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll @@ -7,7 +7,7 @@ define @ceil_nxv1f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -15,6 +15,7 @@ define @ceil_nxv1f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -29,7 +30,7 @@ declare @llvm.experimental.constrained.ceil.nxv1f16( @ceil_nxv2f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -37,6 +38,7 @@ define @ceil_nxv2f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -51,7 +53,7 @@ declare @llvm.experimental.constrained.ceil.nxv2f16( @ceil_nxv4f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -59,6 +61,7 @@ define @ceil_nxv4f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 
; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -73,7 +76,7 @@ declare @llvm.experimental.constrained.ceil.nxv4f16( @ceil_nxv8f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -81,6 +84,7 @@ define @ceil_nxv8f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -95,7 +99,7 @@ declare @llvm.experimental.constrained.ceil.nxv8f16( @ceil_nxv16f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -103,6 +107,7 @@ define @ceil_nxv16f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -117,7 +122,7 @@ declare @llvm.experimental.constrained.ceil.nxv16f16( @ceil_nxv32f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -125,6 +130,7 @@ define @ceil_nxv32f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: 
vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -139,7 +145,7 @@ declare @llvm.experimental.constrained.ceil.nxv32f16( @ceil_nxv1f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -147,6 +153,7 @@ define @ceil_nxv1f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -161,7 +168,7 @@ declare @llvm.experimental.constrained.ceil.nxv1f32( @ceil_nxv2f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -169,6 +176,7 @@ define @ceil_nxv2f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -183,7 +191,7 @@ declare @llvm.experimental.constrained.ceil.nxv2f32( @ceil_nxv4f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -191,6 +199,7 @@ define @ceil_nxv4f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: 
fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -205,7 +214,7 @@ declare @llvm.experimental.constrained.ceil.nxv4f32( @ceil_nxv8f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -213,6 +222,7 @@ define @ceil_nxv8f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -227,7 +237,7 @@ declare @llvm.experimental.constrained.ceil.nxv8f32( @ceil_nxv16f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -235,6 +245,7 @@ define @ceil_nxv16f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -249,7 +260,7 @@ declare @llvm.experimental.constrained.ceil.nxv16f32( @ceil_nxv1f64( %x) strictfp { ; CHECK-LABEL: ceil_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -257,6 +268,7 @@ define @ceil_nxv1f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; 
CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -271,7 +283,7 @@ declare @llvm.experimental.constrained.ceil.nxv1f64( @ceil_nxv2f64( %x) strictfp { ; CHECK-LABEL: ceil_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -279,6 +291,7 @@ define @ceil_nxv2f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -293,7 +306,7 @@ declare @llvm.experimental.constrained.ceil.nxv2f64( @ceil_nxv4f64( %x) strictfp { ; CHECK-LABEL: ceil_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -301,6 +314,7 @@ define @ceil_nxv4f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -315,7 +329,7 @@ declare @llvm.experimental.constrained.ceil.nxv4f64( @ceil_nxv8f64( %x) strictfp { ; CHECK-LABEL: ceil_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -323,6 +337,7 @@ define @ceil_nxv8f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; 
CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll index 31a94532044574..1df452d8641c58 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll @@ -7,7 +7,7 @@ define @floor_nxv1f16( %x) strictfp { ; CHECK-LABEL: floor_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -15,6 +15,7 @@ define @floor_nxv1f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -29,7 +30,7 @@ declare @llvm.experimental.constrained.floor.nxv1f16( @floor_nxv2f16( %x) strictfp { ; CHECK-LABEL: floor_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -37,6 +38,7 @@ define @floor_nxv2f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -51,7 +53,7 @@ declare @llvm.experimental.constrained.floor.nxv2f16( @floor_nxv4f16( %x) strictfp { ; CHECK-LABEL: floor_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, 
v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -59,6 +61,7 @@ define @floor_nxv4f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -73,7 +76,7 @@ declare @llvm.experimental.constrained.floor.nxv4f16( @floor_nxv8f16( %x) strictfp { ; CHECK-LABEL: floor_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -81,6 +84,7 @@ define @floor_nxv8f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -95,7 +99,7 @@ declare @llvm.experimental.constrained.floor.nxv8f16( @floor_nxv16f16( %x) strictfp { ; CHECK-LABEL: floor_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -103,6 +107,7 @@ define @floor_nxv16f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -117,7 +122,7 @@ declare @llvm.experimental.constrained.floor.nxv16f16( @floor_nxv32f16( %x) strictfp { ; CHECK-LABEL: floor_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; 
CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -125,6 +130,7 @@ define @floor_nxv32f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -139,7 +145,7 @@ declare @llvm.experimental.constrained.floor.nxv32f16( @floor_nxv1f32( %x) strictfp { ; CHECK-LABEL: floor_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -147,6 +153,7 @@ define @floor_nxv1f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -161,7 +168,7 @@ declare @llvm.experimental.constrained.floor.nxv1f32( @floor_nxv2f32( %x) strictfp { ; CHECK-LABEL: floor_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -169,6 +176,7 @@ define @floor_nxv2f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -183,7 +191,7 @@ declare @llvm.experimental.constrained.floor.nxv2f32( @floor_nxv4f32( %x) strictfp { ; CHECK-LABEL: floor_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv 
v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -191,6 +199,7 @@ define @floor_nxv4f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -205,7 +214,7 @@ declare @llvm.experimental.constrained.floor.nxv4f32( @floor_nxv8f32( %x) strictfp { ; CHECK-LABEL: floor_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -213,6 +222,7 @@ define @floor_nxv8f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -227,7 +237,7 @@ declare @llvm.experimental.constrained.floor.nxv8f32( @floor_nxv16f32( %x) strictfp { ; CHECK-LABEL: floor_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -235,6 +245,7 @@ define @floor_nxv16f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -249,7 +260,7 @@ declare @llvm.experimental.constrained.floor.nxv16f32( @floor_nxv1f64( %x) strictfp { ; CHECK-LABEL: floor_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; 
CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -257,6 +268,7 @@ define @floor_nxv1f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -271,7 +283,7 @@ declare @llvm.experimental.constrained.floor.nxv1f64( @floor_nxv2f64( %x) strictfp { ; CHECK-LABEL: floor_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -279,6 +291,7 @@ define @floor_nxv2f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -293,7 +306,7 @@ declare @llvm.experimental.constrained.floor.nxv2f64( @floor_nxv4f64( %x) strictfp { ; CHECK-LABEL: floor_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -301,6 +314,7 @@ define @floor_nxv4f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -315,7 +329,7 @@ declare @llvm.experimental.constrained.floor.nxv4f64( @floor_nxv8f64( %x) strictfp { ; CHECK-LABEL: floor_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; 
CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -323,6 +337,7 @@ define @floor_nxv8f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll index 1e93a73ede5d65..404fb72b8abe91 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll @@ -7,7 +7,7 @@ define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: ceil_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -15,6 +15,7 @@ define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -29,7 +30,7 @@ declare <1 x half> @llvm.experimental.constrained.ceil.v1f16(<1 x half>, metadat define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: ceil_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -37,6 +38,7 @@ define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, 
v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -51,7 +53,7 @@ declare <2 x half> @llvm.experimental.constrained.ceil.v2f16(<2 x half>, metadat define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: ceil_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -59,6 +61,7 @@ define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -73,7 +76,7 @@ declare <4 x half> @llvm.experimental.constrained.ceil.v4f16(<4 x half>, metadat define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: ceil_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -81,6 +84,7 @@ define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -95,7 +99,7 @@ declare <8 x half> @llvm.experimental.constrained.ceil.v8f16(<8 x half>, metadat define <16 x half> @ceil_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: ceil_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -103,6 +107,7 @@ define <16 x half> @ceil_v16f16(<16 x half> %x) 
strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -118,7 +123,7 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: ceil_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -126,6 +131,7 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -140,7 +146,7 @@ declare <32 x half> @llvm.experimental.constrained.ceil.v32f16(<32 x half>, meta define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: ceil_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -148,6 +154,7 @@ define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -162,7 +169,7 @@ declare <1 x float> @llvm.experimental.constrained.ceil.v1f32(<1 x float>, metad define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp { ; CHECK-LABEL: ceil_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, 
v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -170,6 +177,7 @@ define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -184,7 +192,7 @@ declare <2 x float> @llvm.experimental.constrained.ceil.v2f32(<2 x float>, metad define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: ceil_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -192,6 +200,7 @@ define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -206,7 +215,7 @@ declare <4 x float> @llvm.experimental.constrained.ceil.v4f32(<4 x float>, metad define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: ceil_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -214,6 +223,7 @@ define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -228,7 +238,7 @@ declare <8 x float> @llvm.experimental.constrained.ceil.v8f32(<8 x float>, metad define <16 x float> @ceil_v16f32(<16 x float> 
%x) strictfp { ; CHECK-LABEL: ceil_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -236,6 +246,7 @@ define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -250,7 +261,7 @@ declare <16 x float> @llvm.experimental.constrained.ceil.v16f32(<16 x float>, me define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp { ; CHECK-LABEL: ceil_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -258,6 +269,7 @@ define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -272,7 +284,7 @@ declare <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double>, met define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: ceil_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -280,6 +292,7 @@ define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, 
v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -294,7 +307,7 @@ declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, met define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: ceil_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -302,6 +315,7 @@ define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -316,7 +330,7 @@ declare <4 x double> @llvm.experimental.constrained.ceil.v4f64(<4 x double>, met define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: ceil_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -324,6 +338,7 @@ define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll index 53018939fc6eb4..2319aab370d2de 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll @@ -7,7 +7,7 @@ define <1 x half> @floor_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: 
floor_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -15,6 +15,7 @@ define <1 x half> @floor_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -29,7 +30,7 @@ declare <1 x half> @llvm.experimental.constrained.floor.v1f16(<1 x half>, metada define <2 x half> @floor_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: floor_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -37,6 +38,7 @@ define <2 x half> @floor_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -51,7 +53,7 @@ declare <2 x half> @llvm.experimental.constrained.floor.v2f16(<2 x half>, metada define <4 x half> @floor_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: floor_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -59,6 +61,7 @@ define <4 x half> @floor_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: 
vfcvt.f.x.v v9, v9, v0.t @@ -73,7 +76,7 @@ declare <4 x half> @llvm.experimental.constrained.floor.v4f16(<4 x half>, metada define <8 x half> @floor_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: floor_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -81,6 +84,7 @@ define <8 x half> @floor_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -95,7 +99,7 @@ declare <8 x half> @llvm.experimental.constrained.floor.v8f16(<8 x half>, metada define <16 x half> @floor_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: floor_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -103,6 +107,7 @@ define <16 x half> @floor_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -118,7 +123,7 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: floor_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -126,6 +131,7 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, 
fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -140,7 +146,7 @@ declare <32 x half> @llvm.experimental.constrained.floor.v32f16(<32 x half>, met define <1 x float> @floor_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: floor_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -148,6 +154,7 @@ define <1 x float> @floor_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -162,7 +169,7 @@ declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, meta define <2 x float> @floor_v2f32(<2 x float> %x) strictfp { ; CHECK-LABEL: floor_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -170,6 +177,7 @@ define <2 x float> @floor_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -184,7 +192,7 @@ declare <2 x float> @llvm.experimental.constrained.floor.v2f32(<2 x float>, meta define <4 x float> @floor_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: floor_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: 
vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -192,6 +200,7 @@ define <4 x float> @floor_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -206,7 +215,7 @@ declare <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float>, meta define <8 x float> @floor_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: floor_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -214,6 +223,7 @@ define <8 x float> @floor_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -228,7 +238,7 @@ declare <8 x float> @llvm.experimental.constrained.floor.v8f32(<8 x float>, meta define <16 x float> @floor_v16f32(<16 x float> %x) strictfp { ; CHECK-LABEL: floor_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -236,6 +246,7 @@ define <16 x float> @floor_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -250,7 +261,7 @@ declare <16 x float> @llvm.experimental.constrained.floor.v16f32(<16 x float>, m define <1 x double> @floor_v1f64(<1 x double> 
%x) strictfp { ; CHECK-LABEL: floor_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -258,6 +269,7 @@ define <1 x double> @floor_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -272,7 +284,7 @@ declare <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double>, me define <2 x double> @floor_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: floor_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -280,6 +292,7 @@ define <2 x double> @floor_v2f64(<2 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -294,7 +307,7 @@ declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, me define <4 x double> @floor_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: floor_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -302,6 +315,7 @@ define <4 x double> @floor_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: 
vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -316,7 +330,7 @@ declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, me define <8 x double> @floor_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: floor_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -324,6 +338,7 @@ define <8 x double> @floor_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll index 9e9a8b8a4b644e..719dd524942846 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.experimental.constrained.nearbyint.v2f16(<2 x half>, me define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -32,7 +33,7 @@ declare 
<4 x half> @llvm.experimental.constrained.nearbyint.v4f16(<4 x half>, me define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -40,6 +41,7 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -55,7 +57,7 @@ declare <8 x half> @llvm.experimental.constrained.nearbyint.v8f16(<8 x half>, me define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -63,6 +65,7 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -78,7 +81,7 @@ declare <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(<16 x half>, define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -86,6 +89,7 @@ define <16 x half> @nearbyint_v16f16(<16 x 
half> %v) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -102,7 +106,7 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -110,6 +114,7 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -125,7 +130,7 @@ declare <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float>, define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp { ; CHECK-LABEL: nearbyint_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -133,6 +138,7 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -148,7 +154,7 @@ declare <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float>, define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp { ; CHECK-LABEL: nearbyint_v4f32: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -156,6 +162,7 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu @@ -171,7 +178,7 @@ declare <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float>, define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp { ; CHECK-LABEL: nearbyint_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -179,6 +186,7 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu @@ -194,7 +202,7 @@ declare <16 x float> @llvm.experimental.constrained.nearbyint.v16f32(<16 x float define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp { ; CHECK-LABEL: nearbyint_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -202,6 +210,7 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: 
vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu @@ -217,7 +226,7 @@ declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp { ; CHECK-LABEL: nearbyint_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0) @@ -225,6 +234,7 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -240,7 +250,7 @@ declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp { ; CHECK-LABEL: nearbyint_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI10_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI10_0)(a0) @@ -248,6 +258,7 @@ define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -263,7 +274,7 @@ declare <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double> define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp { ; CHECK-LABEL: nearbyint_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; 
CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -271,6 +282,7 @@ define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll index f189354237ee3a..e855d9504ff404 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll @@ -9,7 +9,7 @@ define <1 x half> @round_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: round_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define <1 x half> @round_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,7 +32,7 @@ declare <1 x half> @llvm.experimental.constrained.round.v1f16(<1 x half>, metada define <2 x half> @round_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: round_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -39,6 +40,7 @@ define <2 x half> @round_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; 
CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -53,7 +55,7 @@ declare <2 x half> @llvm.experimental.constrained.round.v2f16(<2 x half>, metada define <4 x half> @round_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: round_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -61,6 +63,7 @@ define <4 x half> @round_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -75,7 +78,7 @@ declare <4 x half> @llvm.experimental.constrained.round.v4f16(<4 x half>, metada define <8 x half> @round_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: round_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -83,6 +86,7 @@ define <8 x half> @round_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -97,7 +101,7 @@ declare <8 x half> @llvm.experimental.constrained.round.v8f16(<8 x half>, metada define <16 x half> @round_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: round_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv 
v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -105,6 +109,7 @@ define <16 x half> @round_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -120,7 +125,7 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: round_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -128,6 +133,7 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -142,7 +148,7 @@ declare <32 x half> @llvm.experimental.constrained.round.v32f16(<32 x half>, met define <1 x float> @round_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: round_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -150,6 +156,7 @@ define <1 x float> @round_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -164,7 +171,7 @@ declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, meta define <2 x float> @round_v2f32(<2 x float> %x) strictfp { ; 
CHECK-LABEL: round_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -172,6 +179,7 @@ define <2 x float> @round_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -186,7 +194,7 @@ declare <2 x float> @llvm.experimental.constrained.round.v2f32(<2 x float>, meta define <4 x float> @round_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: round_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -194,6 +202,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -208,7 +217,7 @@ declare <4 x float> @llvm.experimental.constrained.round.v4f32(<4 x float>, meta define <8 x float> @round_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: round_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -216,6 +225,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; 
CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -230,7 +240,7 @@ declare <8 x float> @llvm.experimental.constrained.round.v8f32(<8 x float>, meta define <16 x float> @round_v16f32(<16 x float> %x) strictfp { ; CHECK-LABEL: round_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -238,6 +248,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -252,7 +263,7 @@ declare <16 x float> @llvm.experimental.constrained.round.v16f32(<16 x float>, m define <1 x double> @round_v1f64(<1 x double> %x) strictfp { ; CHECK-LABEL: round_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -260,6 +271,7 @@ define <1 x double> @round_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,7 +286,7 @@ declare <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double>, me define <2 x double> @round_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: round_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -282,6 +294,7 @@ define <2 x double> @round_v2f64(<2 x 
double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -296,7 +309,7 @@ declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, me define <4 x double> @round_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: round_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -304,6 +317,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -318,7 +332,7 @@ declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, me define <8 x double> @round_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: round_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -326,6 +340,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll index 11920c7c31c981..9976cd2a8ab29a 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll @@ -9,7 +9,7 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,7 +32,7 @@ declare <1 x half> @llvm.experimental.constrained.roundeven.v1f16(<1 x half>, me define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -39,6 +40,7 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -53,7 +55,7 @@ declare <2 x half> @llvm.experimental.constrained.roundeven.v2f16(<2 x half>, me define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -61,6 +63,7 @@ 
define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -75,7 +78,7 @@ declare <4 x half> @llvm.experimental.constrained.roundeven.v4f16(<4 x half>, me define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -83,6 +86,7 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -97,7 +101,7 @@ declare <8 x half> @llvm.experimental.constrained.roundeven.v8f16(<8 x half>, me define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -105,6 +109,7 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -120,7 +125,7 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, 
ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -128,6 +133,7 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -142,7 +148,7 @@ declare <32 x half> @llvm.experimental.constrained.roundeven.v32f16(<32 x half>, define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -150,6 +156,7 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -164,7 +171,7 @@ declare <1 x float> @llvm.experimental.constrained.roundeven.v1f32(<1 x float>, define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -172,6 +179,7 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -186,7 +194,7 
@@ declare <2 x float> @llvm.experimental.constrained.roundeven.v2f32(<2 x float>, define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -194,6 +202,7 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -208,7 +217,7 @@ declare <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float>, define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -216,6 +225,7 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -230,7 +240,7 @@ declare <8 x float> @llvm.experimental.constrained.roundeven.v8f32(<8 x float>, define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -238,6 +248,7 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, 
a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -252,7 +263,7 @@ declare <16 x float> @llvm.experimental.constrained.roundeven.v16f32(<16 x float define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp { ; CHECK-LABEL: roundeven_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -260,6 +271,7 @@ define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,7 +286,7 @@ declare <1 x double> @llvm.experimental.constrained.roundeven.v1f64(<1 x double> define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: roundeven_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -282,6 +294,7 @@ define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -296,7 +309,7 @@ declare <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: roundeven_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; 
CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -304,6 +317,7 @@ define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -318,7 +332,7 @@ declare <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double> define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: roundeven_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -326,6 +340,7 @@ define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll index f16581444afca5..eac26451d5a8cc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll @@ -7,13 +7,14 @@ define <1 x half> @trunc_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: trunc_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v 
v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -27,13 +28,14 @@ declare <1 x half> @llvm.experimental.constrained.trunc.v1f16(<1 x half>, metada define <2 x half> @trunc_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: trunc_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -47,13 +49,14 @@ declare <2 x half> @llvm.experimental.constrained.trunc.v2f16(<2 x half>, metada define <4 x half> @trunc_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: trunc_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -67,13 +70,14 @@ declare <4 x half> @llvm.experimental.constrained.trunc.v4f16(<4 x half>, metada define <8 x half> @trunc_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: trunc_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, 
%hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -87,13 +91,14 @@ declare <8 x half> @llvm.experimental.constrained.trunc.v8f16(<8 x half>, metada define <16 x half> @trunc_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: trunc_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -108,13 +113,14 @@ define <32 x half> @trunc_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: trunc_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -128,13 +134,14 @@ declare <32 x half> @llvm.experimental.constrained.trunc.v32f16(<32 x half>, met define <1 x float> @trunc_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: trunc_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, 
e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -148,13 +155,14 @@ declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, meta define <2 x float> @trunc_v2f32(<2 x float> %x) strictfp { ; CHECK-LABEL: trunc_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -168,13 +176,14 @@ declare <2 x float> @llvm.experimental.constrained.trunc.v2f32(<2 x float>, meta define <4 x float> @trunc_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: trunc_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu @@ -188,13 +197,14 @@ declare <4 x float> @llvm.experimental.constrained.trunc.v4f32(<4 x float>, meta define <8 x float> @trunc_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: trunc_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu @@ -208,13 +218,14 @@ declare <8 x float> @llvm.experimental.constrained.trunc.v8f32(<8 x float>, meta define <16 x float> @trunc_v16f32(<16 x float> %x) strictfp { ; CHECK-LABEL: trunc_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu @@ -228,13 +239,14 @@ declare <16 x float> @llvm.experimental.constrained.trunc.v16f32(<16 x float>, m define <1 x double> @trunc_v1f64(<1 x double> %x) strictfp { ; CHECK-LABEL: trunc_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -248,13 +260,14 @@ declare <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double>, me define <2 x double> 
@trunc_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: trunc_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -268,13 +281,14 @@ declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, me define <4 x double> @trunc_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: trunc_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -288,13 +302,14 @@ declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, me define <8 x double> @trunc_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: trunc_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, 
m4, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll index f88a9b3081a1a8..372937bb5ca5df 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll @@ -9,7 +9,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv1f16( @nearbyint_nxv1f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define @nearbyint_nxv1f16( %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -32,7 +33,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv2f16( @nearbyint_nxv2f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -40,6 +41,7 @@ define @nearbyint_nxv2f16( %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -55,7 +57,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv4f16( @nearbyint_nxv4f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; 
CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -63,6 +65,7 @@ define @nearbyint_nxv4f16( %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -78,7 +81,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv8f16( @nearbyint_nxv8f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -86,6 +89,7 @@ define @nearbyint_nxv8f16( %v) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -101,7 +105,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv16f16(< define @nearbyint_nxv16f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -109,6 +113,7 @@ define @nearbyint_nxv16f16( %v) strictf ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -124,7 +129,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv32f16(< define @nearbyint_nxv32f16( %v) strictfp { ; CHECK-LABEL: 
nearbyint_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -132,6 +137,7 @@ define @nearbyint_nxv32f16( %v) strictf ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -147,7 +153,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv1f32( @nearbyint_nxv1f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -155,6 +161,7 @@ define @nearbyint_nxv1f32( %v) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -170,7 +177,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv2f32( @nearbyint_nxv2f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -178,6 +185,7 @@ define @nearbyint_nxv2f32( %v) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu @@ -193,7 +201,7 @@ 
declare @llvm.experimental.constrained.nearbyint.nxv4f32( @nearbyint_nxv4f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -201,6 +209,7 @@ define @nearbyint_nxv4f32( %v) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu @@ -216,7 +225,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv8f32( @nearbyint_nxv8f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -224,6 +233,7 @@ define @nearbyint_nxv8f32( %v) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu @@ -239,7 +249,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv16f32( define @nearbyint_nxv16f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -247,6 +257,7 @@ define @nearbyint_nxv16f32( %v) stric ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; 
CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu @@ -262,7 +273,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv1f64(< define @nearbyint_nxv1f64( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -270,6 +281,7 @@ define @nearbyint_nxv1f64( %v) strict ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -285,7 +297,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv2f64(< define @nearbyint_nxv2f64( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -293,6 +305,7 @@ define @nearbyint_nxv2f64( %v) strict ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -308,7 +321,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv4f64(< define @nearbyint_nxv4f64( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -316,6 +329,7 @@ define @nearbyint_nxv4f64( %v) strict ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: 
vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -331,7 +345,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv8f64(< define @nearbyint_nxv8f64( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -339,6 +353,7 @@ define @nearbyint_nxv8f64( %v) strict ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll index 3276f481f30ea5..aaa7a538e70fb7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll @@ -9,7 +9,7 @@ define @round_nxv1f16( %x) strictfp { ; CHECK-LABEL: round_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define @round_nxv1f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,7 +32,7 @@ declare @llvm.experimental.constrained.round.nxv1f16( @round_nxv2f16( %x) strictfp { ; CHECK-LABEL: round_nxv2f16: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -39,6 +40,7 @@ define @round_nxv2f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -53,7 +55,7 @@ declare @llvm.experimental.constrained.round.nxv2f16( @round_nxv4f16( %x) strictfp { ; CHECK-LABEL: round_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -61,6 +63,7 @@ define @round_nxv4f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -75,7 +78,7 @@ declare @llvm.experimental.constrained.round.nxv4f16( @round_nxv8f16( %x) strictfp { ; CHECK-LABEL: round_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -83,6 +86,7 @@ define @round_nxv8f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -97,7 +101,7 @@ declare @llvm.experimental.constrained.round.nxv8f16( @round_nxv16f16( %x) strictfp { ; CHECK-LABEL: round_nxv16f16: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -105,6 +109,7 @@ define @round_nxv16f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -119,7 +124,7 @@ declare @llvm.experimental.constrained.round.nxv16f16( @round_nxv32f16( %x) strictfp { ; CHECK-LABEL: round_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -127,6 +132,7 @@ define @round_nxv32f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -141,7 +147,7 @@ declare @llvm.experimental.constrained.round.nxv32f16( @round_nxv1f32( %x) strictfp { ; CHECK-LABEL: round_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -149,6 +155,7 @@ define @round_nxv1f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -163,7 +170,7 @@ declare @llvm.experimental.constrained.round.nxv1f32( @round_nxv2f32( %x) strictfp { ; CHECK-LABEL: round_nxv2f32: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -171,6 +178,7 @@ define @round_nxv2f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -185,7 +193,7 @@ declare @llvm.experimental.constrained.round.nxv2f32( @round_nxv4f32( %x) strictfp { ; CHECK-LABEL: round_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -193,6 +201,7 @@ define @round_nxv4f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -207,7 +216,7 @@ declare @llvm.experimental.constrained.round.nxv4f32( @round_nxv8f32( %x) strictfp { ; CHECK-LABEL: round_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -215,6 +224,7 @@ define @round_nxv8f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -229,7 +239,7 @@ declare @llvm.experimental.constrained.round.nxv8f32( @round_nxv16f32( %x) strictfp { ; CHECK-LABEL: round_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, 
zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -237,6 +247,7 @@ define @round_nxv16f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -251,7 +262,7 @@ declare @llvm.experimental.constrained.round.nxv16f32( @round_nxv1f64( %x) strictfp { ; CHECK-LABEL: round_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -259,6 +270,7 @@ define @round_nxv1f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -273,7 +285,7 @@ declare @llvm.experimental.constrained.round.nxv1f64( @round_nxv2f64( %x) strictfp { ; CHECK-LABEL: round_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -281,6 +293,7 @@ define @round_nxv2f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -295,7 +308,7 @@ declare @llvm.experimental.constrained.round.nxv2f64( @round_nxv4f64( %x) strictfp { ; CHECK-LABEL: round_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, 
e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -303,6 +316,7 @@ define @round_nxv4f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -317,7 +331,7 @@ declare @llvm.experimental.constrained.round.nxv4f64( @round_nxv8f64( %x) strictfp { ; CHECK-LABEL: round_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -325,6 +339,7 @@ define @round_nxv8f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll index 4ebfcccbaaa6e6..cdc01d658778bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll @@ -9,7 +9,7 @@ define @roundeven_nxv1f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define @roundeven_nxv1f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; 
CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,7 +32,7 @@ declare @llvm.experimental.constrained.roundeven.nxv1f16( @roundeven_nxv2f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -39,6 +40,7 @@ define @roundeven_nxv2f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -53,7 +55,7 @@ declare @llvm.experimental.constrained.roundeven.nxv2f16( @roundeven_nxv4f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -61,6 +63,7 @@ define @roundeven_nxv4f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -75,7 +78,7 @@ declare @llvm.experimental.constrained.roundeven.nxv4f16( @roundeven_nxv8f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -83,6 +86,7 @@ define @roundeven_nxv8f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, 
zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -97,7 +101,7 @@ declare @llvm.experimental.constrained.roundeven.nxv8f16( @roundeven_nxv16f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -105,6 +109,7 @@ define @roundeven_nxv16f16( %x) strictf ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -119,7 +124,7 @@ declare @llvm.experimental.constrained.roundeven.nxv16f16(< define @roundeven_nxv32f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -127,6 +132,7 @@ define @roundeven_nxv32f16( %x) strictf ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -141,7 +147,7 @@ declare @llvm.experimental.constrained.roundeven.nxv32f16(< define @roundeven_nxv1f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -149,6 +155,7 @@ define @roundeven_nxv1f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 
; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -163,7 +170,7 @@ declare @llvm.experimental.constrained.roundeven.nxv1f32( @roundeven_nxv2f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -171,6 +178,7 @@ define @roundeven_nxv2f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -185,7 +193,7 @@ declare @llvm.experimental.constrained.roundeven.nxv2f32( @roundeven_nxv4f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -193,6 +201,7 @@ define @roundeven_nxv4f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -207,7 +216,7 @@ declare @llvm.experimental.constrained.roundeven.nxv4f32( @roundeven_nxv8f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -215,6 +224,7 @@ define @roundeven_nxv8f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf 
v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -229,7 +239,7 @@ declare @llvm.experimental.constrained.roundeven.nxv8f32( @roundeven_nxv16f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -237,6 +247,7 @@ define @roundeven_nxv16f32( %x) stric ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -251,7 +262,7 @@ declare @llvm.experimental.constrained.roundeven.nxv16f32( define @roundeven_nxv1f64( %x) strictfp { ; CHECK-LABEL: roundeven_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -259,6 +270,7 @@ define @roundeven_nxv1f64( %x) strict ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -273,7 +285,7 @@ declare @llvm.experimental.constrained.roundeven.nxv1f64(< define @roundeven_nxv2f64( %x) strictfp { ; CHECK-LABEL: roundeven_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -281,6 +293,7 @@ define @roundeven_nxv2f64( %x) strict ; 
CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -295,7 +308,7 @@ declare @llvm.experimental.constrained.roundeven.nxv2f64(< define @roundeven_nxv4f64( %x) strictfp { ; CHECK-LABEL: roundeven_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -303,6 +316,7 @@ define @roundeven_nxv4f64( %x) strict ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -317,7 +331,7 @@ declare @llvm.experimental.constrained.roundeven.nxv4f64(< define @roundeven_nxv8f64( %x) strictfp { ; CHECK-LABEL: roundeven_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -325,6 +339,7 @@ define @roundeven_nxv8f64( %x) strict ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll index 3665669d83a3d4..21615b516da898 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll @@ -7,13 +7,14 @@ define @trunc_nxv1f16( %x) strictfp { ; CHECK-LABEL: 
trunc_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -27,13 +28,14 @@ declare @llvm.experimental.constrained.trunc.nxv1f16( @trunc_nxv2f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -47,13 +49,14 @@ declare @llvm.experimental.constrained.trunc.nxv2f16( @trunc_nxv4f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -67,13 +70,14 @@ declare @llvm.experimental.constrained.trunc.nxv4f16( @trunc_nxv8f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv8f16: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -87,13 +91,14 @@ declare @llvm.experimental.constrained.trunc.nxv8f16( @trunc_nxv16f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -107,13 +112,14 @@ declare @llvm.experimental.constrained.trunc.nxv16f16( @trunc_nxv32f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -127,13 +133,14 @@ declare @llvm.experimental.constrained.trunc.nxv32f16( @trunc_nxv1f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, 
zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -147,13 +154,14 @@ declare @llvm.experimental.constrained.trunc.nxv1f32( @trunc_nxv2f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu @@ -167,13 +175,14 @@ declare @llvm.experimental.constrained.trunc.nxv2f32( @trunc_nxv4f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu @@ -187,13 +196,14 @@ declare @llvm.experimental.constrained.trunc.nxv4f32( @trunc_nxv8f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: 
vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu @@ -207,13 +217,14 @@ declare @llvm.experimental.constrained.trunc.nxv8f32( @trunc_nxv16f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu @@ -227,13 +238,14 @@ declare @llvm.experimental.constrained.trunc.nxv16f32( @trunc_nxv1f64( %x) strictfp { ; CHECK-LABEL: trunc_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -247,13 +259,14 @@ declare @llvm.experimental.constrained.trunc.nxv1f64( @trunc_nxv2f64( %x) strictfp { ; CHECK-LABEL: trunc_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; 
CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -267,13 +280,14 @@ declare @llvm.experimental.constrained.trunc.nxv2f64( @trunc_nxv4f64( %x) strictfp { ; CHECK-LABEL: trunc_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -287,13 +301,14 @@ declare @llvm.experimental.constrained.trunc.nxv4f64( @trunc_nxv8f64( %x) strictfp { ; CHECK-LABEL: trunc_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu From f284af4863640e6b68918aa23b14498c1b8e2245 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 28 May 2024 10:50:28 -0500 Subject: [PATCH 25/89] [Offload][Fix] Fix lazy initialization with multiple images Summary: There was a bug here where we would initialize the plugin multiple times when there were multiple images. 
Fix it by putting the `is_initliaized` check later. --- offload/src/PluginManager.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp index f72007849e36e4..13f08b142b8769 100644 --- a/offload/src/PluginManager.cpp +++ b/offload/src/PluginManager.cpp @@ -155,11 +155,11 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) { // Initialize all the plugins that have associated images. for (auto &Plugin : Plugins) { - if (Plugin->is_initialized()) - continue; - // Extract the exectuable image and extra information if availible. for (int32_t i = 0; i < Desc->NumDeviceImages; ++i) { + if (Plugin->is_initialized()) + continue; + if (!Plugin->is_valid_binary(&Desc->DeviceImages[i], /*Initialized=*/false)) continue; From af22e274e9c5643780f25066442e05b5bd453328 Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Tue, 28 May 2024 17:59:23 +0200 Subject: [PATCH 26/89] TosaToTensor: Support reshape on tensors of unsigned integer (#91734) This adds - `mlir::tosa::populateTosaToLinalgTypeConversion` which converts tensors of unsigned integers into tensors of signless integers - modifies the `tosa.reshape` lowering in TosaToTensor to use the type converter correctly I choose to implement the type converter in `mlir/Conversion/TosaToLinalg/TosaToLinalg.h` instead of `mlir/Conversion/TosaToTensor/TosaToTensor.h` because I need the same type converter in the TosaToLinalg lowerings (future PR). Alternatively, I could duplicate the type converter so it exists both in TosaToLinalg and TosaToTensor. Let me know if you prefer that. 
--- .../Conversion/TosaToTensor/TosaToTensor.h | 4 +- .../mlir/Dialect/Tosa/Transforms/Passes.h | 3 ++ .../Conversion/TosaToTensor/TosaToTensor.cpp | 33 +++++++----- .../TosaToTensor/TosaToTensorPass.cpp | 5 +- .../Dialect/Tosa/Transforms/CMakeLists.txt | 1 + .../Tosa/Transforms/TosaTypeConverters.cpp | 52 +++++++++++++++++++ .../TosaToTensor/tosa-to-tensor.mlir | 14 +++++ 7 files changed, 97 insertions(+), 15 deletions(-) create mode 100644 mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp diff --git a/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h b/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h index 3953c83f3aa106..76a4b1b1563366 100644 --- a/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h +++ b/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h @@ -16,6 +16,7 @@ #include "mlir/Pass/Pass.h" namespace mlir { +class TypeConverter; #define GEN_PASS_DECL_TOSATOTENSOR #include "mlir/Conversion/Passes.h.inc" @@ -24,7 +25,8 @@ namespace tosa { std::unique_ptr createTosaToTensor(); -void populateTosaToTensorConversionPatterns(RewritePatternSet *patterns); +void populateTosaToTensorConversionPatterns(TypeConverter &converter, + RewritePatternSet *patterns); } // namespace tosa } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h index fbfc56dfe2cf4f..1f9522b51a4cf5 100644 --- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h @@ -18,6 +18,7 @@ #include "mlir/Pass/Pass.h" namespace mlir { +class TypeConverter; namespace tosa { #define GEN_PASS_DECL @@ -38,6 +39,8 @@ void populateTosaConstantReduction(MLIRContext *ctx, RewritePatternSet &patterns, bool aggressiveReduceConstant); +void populateTosaTypeConversion(TypeConverter &converter); + std::unique_ptr createTosaLayerwiseConstantFoldPass(); std::unique_ptr createTosaLayerwiseConstantFoldPass( const TosaLayerwiseConstantFoldPassOptions 
&options); diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp index 89f956a5e70175..c0c015ab34aab0 100644 --- a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp +++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp @@ -224,8 +224,17 @@ class ReshapeConverter : public OpConversionPattern { matchAndRewrite(tosa::ReshapeOp reshape, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const final { auto loc = reshape.getLoc(); - auto resultType = reshape.getResult().getType(); - auto input = reshape.getInput1(); + auto resultType = cast_if_present( + getTypeConverter()->convertType(reshape.getType())); + if (!resultType) { + return rewriter.notifyMatchFailure(reshape.getLoc(), + "could not convert result type"); + } + auto input = dyn_cast>(adaptor.getInput1()); + if (!input) { + return rewriter.notifyMatchFailure(reshape.getLoc(), + "expected input type to be tensor"); + } auto newShape = reshape.getNewShape(); // Infer all intermediate types @@ -288,12 +297,13 @@ class SliceConverter : public OpConversionPattern { } }; -class PadConverter : public OpRewritePattern { +class PadConverter : public OpConversionPattern { public: - using OpRewritePattern::OpRewritePattern; + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(tosa::PadOp padOp, - PatternRewriter &rewriter) const final { + LogicalResult + matchAndRewrite(tosa::PadOp padOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { auto loc = padOp.getLoc(); auto input = padOp.getInput1(); auto padding = padOp.getPadding(); @@ -428,11 +438,8 @@ struct ConcatConverter : public OpConversionPattern { } // namespace void mlir::tosa::populateTosaToTensorConversionPatterns( - RewritePatternSet *patterns) { - patterns->add< - ConcatConverter, - PadConverter, - ReshapeConverter, - SliceConverter - >(patterns->getContext()); + TypeConverter &converter, RewritePatternSet *patterns) { + patterns + ->add( + 
converter, patterns->getContext()); } diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp index 50dc55667fb94e..fa1c2cf7fba986 100644 --- a/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp +++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp @@ -42,7 +42,10 @@ struct TosaToTensor : public impl::TosaToTensorBase { target.addLegalDialect(); target.addLegalDialect(); - mlir::tosa::populateTosaToTensorConversionPatterns(&patterns); + TypeConverter converter; + mlir::tosa::populateTosaTypeConversion(converter); + + mlir::tosa::populateTosaToTensorConversionPatterns(converter, &patterns); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) diff --git a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt index 0e6510ba1e9255..c78a74b874aff1 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt @@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRTosaTransforms TosaLayerwiseConstantFoldPass.cpp TosaMakeBroadcastable.cpp TosaOptionalDecompositions.cpp + TosaTypeConverters.cpp TosaValidation.cpp ADDITIONAL_HEADER_DIRS diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp new file mode 100644 index 00000000000000..d2650de8cd7f02 --- /dev/null +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp @@ -0,0 +1,52 @@ + +//===- TosaTypeConverters.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Type converters for lowering TOSA to linalg/arith. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Tosa/Transforms/Passes.h" + +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +void mlir::tosa::populateTosaTypeConversion(TypeConverter &converter) { + converter.addConversion([&](Type type) -> std::optional { + if (type.isUnsignedInteger()) { + return IntegerType::get(type.getContext(), type.getIntOrFloatBitWidth(), + IntegerType::SignednessSemantics::Signless); + } + return type; + }); + converter.addConversion([&](TensorType type) -> std::optional { + auto converted = converter.convertType(type.getElementType()); + if (!converted) + return {}; + return type.clone(converted); + }); + converter.addSourceMaterialization([&](OpBuilder &builder, Type resultType, + ValueRange inputs, + Location loc) -> std::optional { + if (inputs.size() != 1) + return std::nullopt; + + return builder.create(loc, resultType, inputs) + .getResult(0); + }); + converter.addTargetMaterialization([&](OpBuilder &builder, Type resultType, + ValueRange inputs, + Location loc) -> std::optional { + if (inputs.size() != 1) + return std::nullopt; + + return builder.create(loc, resultType, inputs) + .getResult(0); + }); +} diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir index 72e7e4cc840886..1e62e25176a007 100644 --- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir +++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir @@ -420,6 +420,20 @@ func.func @test_reshape_6d_down_s2s_explicit(%arg0: tensor<1x2x3x5x7x11xf32>) -> // ----- +// CHECK-LABEL: @test_reshape_samerank_unsigned +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x2xui8>) +func.func @test_reshape_samerank_unsigned(%arg0: tensor<3x2xui8>) -> tensor<2x3xui8> { + // CHECK-NEXT: %[[CAST1:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : tensor<3x2xui8> to tensor<3x2xi8> + // CHECK-NEXT: %[[RESHAPE1:.*]] = 
tensor.collapse_shape %[[CAST1]] {{\[}}[0, 1]] : tensor<3x2xi8> into tensor<6xi8> + // CHECK-NEXT: %[[RESHAPE2:.*]] = tensor.expand_shape %[[RESHAPE1]] {{\[}}[0, 1]] output_shape {{\[}}2, 3] : tensor<6xi8> into tensor<2x3xi8> + // CHECK-NEXT: %[[CAST2:.*]] = builtin.unrealized_conversion_cast %[[RESHAPE2]] : tensor<2x3xi8> to tensor<2x3xui8 + %0 = "tosa.reshape"(%arg0) {new_shape = array} : (tensor<3x2xui8>) -> tensor<2x3xui8> + // CHECK-NEXT: return %[[CAST2]] + return %0 : tensor<2x3xui8> +} + +// ----- + // CHECK-LABEL: func @slice func.func @slice(%arg0: tensor<6xf32>) ->() { // CHECK: [[SLICE:%.+]] = tensor.extract_slice %arg0[2] [1] [1] From fea7399e97b73a3209fcbe3338d412069769a637 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 09:03:06 -0700 Subject: [PATCH 27/89] [clang] Fix a warning This patch fixes: clang/unittests/Interpreter/IncrementalProcessingTest.cpp:39:13: error: unused function 'HostSupportsJit' [-Werror,-Wunused-function] --- clang/unittests/Interpreter/IncrementalProcessingTest.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp index 54159173d91e39..f3b091b0c0e6cb 100644 --- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp +++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp @@ -36,14 +36,6 @@ using namespace clang; namespace { -static bool HostSupportsJit() { - auto J = llvm::orc::LLJITBuilder().create(); - if (J) - return true; - LLVMConsumeError(llvm::wrap(J.takeError())); - return false; -} - // Incremental processing produces several modules, all using the same "main // file". Make sure CodeGen can cope with that, e.g. for static initializers. 
const char TestProgram1[] = "extern \"C\" int funcForProg1() { return 17; }\n" From 273777ead296c9ab2c157d16b750e3ee1ace08ec Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 28 May 2024 12:04:44 -0400 Subject: [PATCH 28/89] clang:: to llvm::; NFC These interfaces are LLVM interfaces, not Clang ones; but this worked because of LLVM.h adding the interfaces to the clang namespace. --- clang/lib/AST/APValue.cpp | 2 +- clang/lib/Analysis/MacroExpansionContext.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/APValue.cpp b/clang/lib/AST/APValue.cpp index 8c77b563657d90..d8e33ff421c06c 100644 --- a/clang/lib/AST/APValue.cpp +++ b/clang/lib/AST/APValue.cpp @@ -90,7 +90,7 @@ QualType APValue::LValueBase::getType() const { // For a materialized temporary, the type of the temporary we materialized // may not be the type of the expression. if (const MaterializeTemporaryExpr *MTE = - clang::dyn_cast(Base)) { + llvm::dyn_cast(Base)) { SmallVector CommaLHSs; SmallVector Adjustments; const Expr *Temp = MTE->getSubExpr(); diff --git a/clang/lib/Analysis/MacroExpansionContext.cpp b/clang/lib/Analysis/MacroExpansionContext.cpp index 564e359668a510..b212b7f2457927 100644 --- a/clang/lib/Analysis/MacroExpansionContext.cpp +++ b/clang/lib/Analysis/MacroExpansionContext.cpp @@ -12,7 +12,7 @@ #define DEBUG_TYPE "macro-expansion-context" -static void dumpTokenInto(const clang::Preprocessor &PP, clang::raw_ostream &OS, +static void dumpTokenInto(const clang::Preprocessor &PP, llvm::raw_ostream &OS, clang::Token Tok); namespace clang { From 259caad2f75011174d39615bb0ba31955d16d498 Mon Sep 17 00:00:00 2001 From: Oleksandr T Date: Tue, 28 May 2024 19:08:38 +0300 Subject: [PATCH 29/89] [Clang] Fix an assertion failure when checking invalid `this` (#93490) Skip explicit this check in non-valid scopes due to `null` type in lambdas with invalid captures or incomplete parameter lists during parsing Fixes #91536 --- clang/docs/ReleaseNotes.rst | 1 + 
clang/lib/Sema/SemaExprCXX.cpp | 6 +++--- clang/test/SemaCXX/invalid-this-in-lambda.cpp | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 clang/test/SemaCXX/invalid-this-in-lambda.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6b746cda53c71b..173e61fbf7b2c1 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -803,6 +803,7 @@ Bug Fixes to C++ Support with the same parameters not to be diagnosed. (Fixes #GH93456). - Clang now diagnoses unexpanded parameter packs in attributes. (Fixes #GH93269). - Clang now allows ``@$``` in raw string literals. Fixes (#GH93130). +- Fix an assertion failure when checking invalid ``this`` usage in the wrong context. (Fixes #GH91536). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index d3e9dcb4f4399a..6595abbcdda5b1 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1444,10 +1444,10 @@ bool Sema::CheckCXXThisType(SourceLocation Loc, QualType Type) { // category are defined within such member functions as they are within // an implicit object member function). 
DeclContext *DC = getFunctionLevelDeclContext(); - if (const auto *Method = dyn_cast(DC); - Method && Method->isExplicitObjectMemberFunction()) { + const auto *Method = dyn_cast(DC); + if (Method && Method->isExplicitObjectMemberFunction()) { Diag(Loc, diag::err_invalid_this_use) << 1; - } else if (isLambdaCallWithExplicitObjectParameter(CurContext)) { + } else if (Method && isLambdaCallWithExplicitObjectParameter(CurContext)) { Diag(Loc, diag::err_invalid_this_use) << 1; } else { Diag(Loc, diag::err_invalid_this_use) << 0; diff --git a/clang/test/SemaCXX/invalid-this-in-lambda.cpp b/clang/test/SemaCXX/invalid-this-in-lambda.cpp new file mode 100644 index 00000000000000..ae65bda025e232 --- /dev/null +++ b/clang/test/SemaCXX/invalid-this-in-lambda.cpp @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s + +decltype([]()->decltype(this) { }) a; // expected-error {{invalid use of 'this' outside of a non-static member function}} + From 234cc40adc610a55d1a5a2fe798a9dd07b993f0c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 09:23:02 -0700 Subject: [PATCH 30/89] [LAA] Limit no-overlap check to at least one loop-invariant accesses. Limit the logic added in https://github.com/llvm/llvm-project/pull/9230 to cases where either sink or source are loop-invariant, to avoid compile-time increases. This is not needed for correctness. I am working on follow-up changes to reduce the compile-time impact in general to allow us to enable this again for any source/sink. 
This should fix the compile-time regression introduced by this change: * compile-time improvement with this change: https://llvm-compile-time-tracker.com/compare.php?from=4351787fb650da6d1bfb8d6e58753c90dcd4c418&to=b89010a2eb5f98494787c1c3b77f25208c59090c&stat=instructions:u * compile-time improvement with original patch reverted on top of this change: https://llvm-compile-time-tracker.com/compare.php?from=b89010a2eb5f98494787c1c3b77f25208c59090c&to=19a1103fe68115cfd7d6472c6961f4fabe81a593&stat=instructions:u --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 33 +++++++++++-------- .../LoopAccessAnalysis/depend_diff_types.ll | 10 +++++- .../non-constant-strides-backward.ll | 7 +++- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index bc8b9b8479e4ff..bd4c2a35ebf2cb 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1983,20 +1983,25 @@ getDependenceDistanceStrideAndSize( return MemoryDepChecker::Dependence::IndirectUnsafe; // Check if we can prove that Sink only accesses memory after Src's end or - // vice versa. - const auto &[SrcStart, SrcEnd] = - getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE); - const auto &[SinkStart, SinkEnd] = - getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE); - - if (!isa(SrcStart) && - !isa(SrcEnd) && - !isa(SinkStart) && - !isa(SinkEnd)) { - if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart)) - return MemoryDepChecker::Dependence::NoDep; - if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart)) - return MemoryDepChecker::Dependence::NoDep; + // vice versa. At the moment this is limited to cases where either source or + // sink are loop invariant to avoid compile-time increases. This is not + // required for correctness. 
+ if (SE.isLoopInvariant(Src, InnermostLoop) || + SE.isLoopInvariant(Sink, InnermostLoop)) { + const auto &[SrcStart, SrcEnd] = + getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE); + const auto &[SinkStart, SinkEnd] = + getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE); + + if (!isa(SrcStart) && + !isa(SrcEnd) && + !isa(SinkStart) && + !isa(SinkEnd)) { + if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart)) + return MemoryDepChecker::Dependence::NoDep; + if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart)) + return MemoryDepChecker::Dependence::NoDep; + } } // Need accesses with constant strides and the same direction. We don't want diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll index 809b15b2004952..81d8b01fe7fb72 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll @@ -130,8 +130,16 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) { ; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence' ; CHECK-NEXT: loop: ; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding. +; CHECK-NEXT: Unknown data dependence. 
; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 -> +; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8 +; CHECK-EMPTY: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 -> +; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8 +; CHECK-EMPTY: ; CHECK-NEXT: BackwardVectorizableButPreventsForwarding: ; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 -> ; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll index 845ff078ee0eb4..416742a94e0d36 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll @@ -45,8 +45,13 @@ exit: define void @different_non_constant_strides_known_backward_distance_larger_than_trip_count(ptr %A) { ; CHECK-LABEL: 'different_non_constant_strides_known_backward_distance_larger_than_trip_count' ; CHECK-NEXT: loop: -; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. 
; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %add, ptr %gep.mul.2, align 4 +; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: ; CHECK-EMPTY: From d582958618cc5aeff58c512399bef7b263fedd59 Mon Sep 17 00:00:00 2001 From: AtariDreams Date: Tue, 28 May 2024 12:25:43 -0400 Subject: [PATCH 31/89] Revert "[Legalizer] Check full condition for UMIN and UMAX just like the code below does for SMIN and SMAX" (#93573) Reverts llvm/llvm-project#87932 --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index c04f7208c61f2a..d8b0f52ecf9e32 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3972,7 +3972,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { // target can override this with custom lowering and calling the // implementation functions. LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - if (LI.isLegalOrCustom({G_UMIN, Ty}) && LI.isLegalOrCustom({G_UMAX, Ty})) + if (LI.isLegalOrCustom({G_UMIN, Ty})) return lowerAddSubSatToMinMax(MI); return lowerAddSubSatToAddoSubo(MI); } From 42944e4600827738fae868f0df831fb2678be8b4 Mon Sep 17 00:00:00 2001 From: Miro Bucko Date: Tue, 28 May 2024 23:29:10 +0700 Subject: [PATCH 32/89] Add SBAddressRange and SBAddressRangeList to SB API (#92014) This adds new SB API calls and classes to allow a user of the SB API to obtain address ranges from SBFunction and SBBlock. 
--- lldb/bindings/headers.swig | 2 + .../interface/SBAddressRangeDocstrings.i | 3 + .../interface/SBAddressRangeExtensions.i | 11 + .../interface/SBAddressRangeListDocstrings.i | 3 + .../interface/SBAddressRangeListExtensions.i | 29 ++ lldb/bindings/interfaces.swig | 6 + lldb/include/lldb/API/LLDB.h | 2 + lldb/include/lldb/API/SBAddress.h | 1 + lldb/include/lldb/API/SBAddressRange.h | 66 +++++ lldb/include/lldb/API/SBAddressRangeList.h | 54 ++++ lldb/include/lldb/API/SBBlock.h | 4 + lldb/include/lldb/API/SBDefines.h | 2 + lldb/include/lldb/API/SBFunction.h | 3 + lldb/include/lldb/API/SBStream.h | 2 + lldb/include/lldb/API/SBTarget.h | 1 + lldb/include/lldb/Core/AddressRange.h | 14 + lldb/include/lldb/Core/AddressRangeListImpl.h | 51 ++++ lldb/include/lldb/Symbol/Block.h | 2 + lldb/include/lldb/lldb-forward.h | 3 + lldb/source/API/CMakeLists.txt | 2 + lldb/source/API/SBAddressRange.cpp | 103 +++++++ lldb/source/API/SBAddressRangeList.cpp | 94 +++++++ lldb/source/API/SBBlock.cpp | 10 + lldb/source/API/SBFunction.cpp | 14 + lldb/source/Core/AddressRange.cpp | 43 +++ lldb/source/Core/AddressRangeListImpl.cpp | 50 ++++ lldb/source/Core/CMakeLists.txt | 1 + lldb/source/Symbol/Block.cpp | 16 ++ .../API/python_api/address_range/Makefile | 3 + .../address_range/TestAddressRange.py | 256 ++++++++++++++++++ .../API/python_api/address_range/main.cpp | 8 + 31 files changed, 859 insertions(+) create mode 100644 lldb/bindings/interface/SBAddressRangeDocstrings.i create mode 100644 lldb/bindings/interface/SBAddressRangeExtensions.i create mode 100644 lldb/bindings/interface/SBAddressRangeListDocstrings.i create mode 100644 lldb/bindings/interface/SBAddressRangeListExtensions.i create mode 100644 lldb/include/lldb/API/SBAddressRange.h create mode 100644 lldb/include/lldb/API/SBAddressRangeList.h create mode 100644 lldb/include/lldb/Core/AddressRangeListImpl.h create mode 100644 lldb/source/API/SBAddressRange.cpp create mode 100644 lldb/source/API/SBAddressRangeList.cpp create mode 
100644 lldb/source/Core/AddressRangeListImpl.cpp create mode 100644 lldb/test/API/python_api/address_range/Makefile create mode 100644 lldb/test/API/python_api/address_range/TestAddressRange.py create mode 100644 lldb/test/API/python_api/address_range/main.cpp diff --git a/lldb/bindings/headers.swig b/lldb/bindings/headers.swig index ffdc3c31ec883a..c91504604b6ac6 100644 --- a/lldb/bindings/headers.swig +++ b/lldb/bindings/headers.swig @@ -8,6 +8,8 @@ %{ #include "lldb/lldb-public.h" #include "lldb/API/SBAddress.h" +#include "lldb/API/SBAddressRange.h" +#include "lldb/API/SBAddressRangeList.h" #include "lldb/API/SBAttachInfo.h" #include "lldb/API/SBBlock.h" #include "lldb/API/SBBreakpoint.h" diff --git a/lldb/bindings/interface/SBAddressRangeDocstrings.i b/lldb/bindings/interface/SBAddressRangeDocstrings.i new file mode 100644 index 00000000000000..650195704d73e6 --- /dev/null +++ b/lldb/bindings/interface/SBAddressRangeDocstrings.i @@ -0,0 +1,3 @@ +%feature("docstring", +"API clients can get address range information." +) lldb::SBAddressRange; diff --git a/lldb/bindings/interface/SBAddressRangeExtensions.i b/lldb/bindings/interface/SBAddressRangeExtensions.i new file mode 100644 index 00000000000000..31bcfcb64590bc --- /dev/null +++ b/lldb/bindings/interface/SBAddressRangeExtensions.i @@ -0,0 +1,11 @@ +%extend lldb::SBAddressRange { +#ifdef SWIGPYTHON + %pythoncode%{ + def __repr__(self): + import lldb + stream = lldb.SBStream() + self.GetDescription(stream, lldb.target if lldb.target else lldb.SBTarget()) + return stream.GetData() + %} +#endif +} diff --git a/lldb/bindings/interface/SBAddressRangeListDocstrings.i b/lldb/bindings/interface/SBAddressRangeListDocstrings.i new file mode 100644 index 00000000000000..e4b96b9ca59312 --- /dev/null +++ b/lldb/bindings/interface/SBAddressRangeListDocstrings.i @@ -0,0 +1,3 @@ +%feature("docstring", +"Represents a list of :py:class:`SBAddressRange`." 
+) lldb::SBAddressRangeList; diff --git a/lldb/bindings/interface/SBAddressRangeListExtensions.i b/lldb/bindings/interface/SBAddressRangeListExtensions.i new file mode 100644 index 00000000000000..e281a84d73d27d --- /dev/null +++ b/lldb/bindings/interface/SBAddressRangeListExtensions.i @@ -0,0 +1,29 @@ +%extend lldb::SBAddressRangeList { +#ifdef SWIGPYTHON + %pythoncode%{ + def __len__(self): + '''Return the number of address ranges in a lldb.SBAddressRangeList object.''' + return self.GetSize() + + def __iter__(self): + '''Iterate over all the address ranges in a lldb.SBAddressRangeList object.''' + return lldb_iter(self, 'GetSize', 'GetAddressRangeAtIndex') + + def __getitem__(self, idx): + '''Get the address range at a given index in an lldb.SBAddressRangeList object.''' + if not isinstance(idx, int): + raise TypeError("unsupported index type: %s" % type(idx)) + count = len(self) + if not (-count <= idx < count): + raise IndexError("list index out of range") + idx %= count + return self.GetAddressRangeAtIndex(idx) + + def __repr__(self): + import lldb + stream = lldb.SBStream() + self.GetDescription(stream, lldb.target if lldb.target else lldb.SBTarget()) + return stream.GetData() + %} +#endif +} diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig index 2a29a8dd7ef0b4..0953f4c72a9101 100644 --- a/lldb/bindings/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -12,6 +12,8 @@ /* Docstrings for SB classes and methods */ %include "./interface/SBAddressDocstrings.i" +%include "./interface/SBAddressRangeDocstrings.i" +%include "./interface/SBAddressRangeListDocstrings.i" %include "./interface/SBAttachInfoDocstrings.i" %include "./interface/SBBlockDocstrings.i" %include "./interface/SBBreakpointDocstrings.i" @@ -86,6 +88,8 @@ /* API headers */ %include "lldb/API/SBAddress.h" +%include "lldb/API/SBAddressRange.h" +%include "lldb/API/SBAddressRangeList.h" %include "lldb/API/SBAttachInfo.h" %include "lldb/API/SBBlock.h" %include 
"lldb/API/SBBreakpoint.h" @@ -163,6 +167,8 @@ /* Extensions for SB classes */ %include "./interface/SBAddressExtensions.i" +%include "./interface/SBAddressRangeExtensions.i" +%include "./interface/SBAddressRangeListExtensions.i" %include "./interface/SBBlockExtensions.i" %include "./interface/SBBreakpointExtensions.i" %include "./interface/SBBreakpointListExtensions.i" diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h index b256544326a224..d8cc9f5067fe94 100644 --- a/lldb/include/lldb/API/LLDB.h +++ b/lldb/include/lldb/API/LLDB.h @@ -10,6 +10,8 @@ #define LLDB_API_LLDB_H #include "lldb/API/SBAddress.h" +#include "lldb/API/SBAddressRange.h" +#include "lldb/API/SBAddressRangeList.h" #include "lldb/API/SBAttachInfo.h" #include "lldb/API/SBBlock.h" #include "lldb/API/SBBreakpoint.h" diff --git a/lldb/include/lldb/API/SBAddress.h b/lldb/include/lldb/API/SBAddress.h index 5e5f355ccc390c..430dad4862dbff 100644 --- a/lldb/include/lldb/API/SBAddress.h +++ b/lldb/include/lldb/API/SBAddress.h @@ -86,6 +86,7 @@ class LLDB_API SBAddress { lldb::SBLineEntry GetLineEntry(); protected: + friend class SBAddressRange; friend class SBBlock; friend class SBBreakpoint; friend class SBBreakpointLocation; diff --git a/lldb/include/lldb/API/SBAddressRange.h b/lldb/include/lldb/API/SBAddressRange.h new file mode 100644 index 00000000000000..152bd82426af1c --- /dev/null +++ b/lldb/include/lldb/API/SBAddressRange.h @@ -0,0 +1,66 @@ +//===-- SBAddressRange.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBADDRESSRANGE_H +#define LLDB_API_SBADDRESSRANGE_H + +#include "lldb/API/SBDefines.h" + +namespace lldb { + +class LLDB_API SBAddressRange { +public: + SBAddressRange(); + + SBAddressRange(const lldb::SBAddressRange &rhs); + + SBAddressRange(lldb::SBAddress addr, lldb::addr_t byte_size); + + ~SBAddressRange(); + + const lldb::SBAddressRange &operator=(const lldb::SBAddressRange &rhs); + + void Clear(); + + /// Check the address range refers to a valid base address and has a byte + /// size greater than zero. + /// + /// \return + /// True if the address range is valid, false otherwise. + bool IsValid() const; + + /// Get the base address of the range. + /// + /// \return + /// Base address object. + lldb::SBAddress GetBaseAddress() const; + + /// Get the byte size of this range. + /// + /// \return + /// The size in bytes of this address range. + lldb::addr_t GetByteSize() const; + + bool operator==(const SBAddressRange &rhs); + + bool operator!=(const SBAddressRange &rhs); + + bool GetDescription(lldb::SBStream &description, const SBTarget target); + +private: + friend class SBAddressRangeList; + friend class SBBlock; + friend class SBFunction; + friend class SBProcess; + + AddressRangeUP m_opaque_up; +}; + +} // namespace lldb + +#endif // LLDB_API_SBADDRESSRANGE_H diff --git a/lldb/include/lldb/API/SBAddressRangeList.h b/lldb/include/lldb/API/SBAddressRangeList.h new file mode 100644 index 00000000000000..a123287ef1b4fa --- /dev/null +++ b/lldb/include/lldb/API/SBAddressRangeList.h @@ -0,0 +1,54 @@ +//===-- SBAddressRangeList.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBADDRESSRANGELIST_H +#define LLDB_API_SBADDRESSRANGELIST_H + +#include + +#include "lldb/API/SBDefines.h" + +namespace lldb_private { +class AddressRangeListImpl; +} + +namespace lldb { + +class LLDB_API SBAddressRangeList { +public: + SBAddressRangeList(); + + SBAddressRangeList(const lldb::SBAddressRangeList &rhs); + + ~SBAddressRangeList(); + + const lldb::SBAddressRangeList & + operator=(const lldb::SBAddressRangeList &rhs); + + uint32_t GetSize() const; + + void Clear(); + + SBAddressRange GetAddressRangeAtIndex(uint64_t idx); + + void Append(const lldb::SBAddressRange &addr_range); + + void Append(const lldb::SBAddressRangeList &addr_range_list); + + bool GetDescription(lldb::SBStream &description, const SBTarget &target); + +private: + friend class SBBlock; + friend class SBProcess; + + std::unique_ptr m_opaque_up; +}; + +} // namespace lldb + +#endif // LLDB_API_SBADDRESSRANGELIST_H diff --git a/lldb/include/lldb/API/SBBlock.h b/lldb/include/lldb/API/SBBlock.h index 2570099f7652f3..de4bb22be26925 100644 --- a/lldb/include/lldb/API/SBBlock.h +++ b/lldb/include/lldb/API/SBBlock.h @@ -9,6 +9,8 @@ #ifndef LLDB_API_SBBLOCK_H #define LLDB_API_SBBLOCK_H +#include "lldb/API/SBAddressRange.h" +#include "lldb/API/SBAddressRangeList.h" #include "lldb/API/SBDefines.h" #include "lldb/API/SBFrame.h" #include "lldb/API/SBTarget.h" @@ -52,6 +54,8 @@ class LLDB_API SBBlock { lldb::SBAddress GetRangeEndAddress(uint32_t idx); + lldb::SBAddressRangeList GetRanges(); + uint32_t GetRangeIndexForBlockAddress(lldb::SBAddress block_addr); lldb::SBValueList GetVariables(lldb::SBFrame &frame, bool arguments, diff --git a/lldb/include/lldb/API/SBDefines.h b/lldb/include/lldb/API/SBDefines.h index 1181920677b46f..87c0a1c3661ca3 100644 --- a/lldb/include/lldb/API/SBDefines.h +++ b/lldb/include/lldb/API/SBDefines.h @@ -43,6 
+43,8 @@ namespace lldb { class LLDB_API SBAddress; +class LLDB_API SBAddressRange; +class LLDB_API SBAddressRangeList; class LLDB_API SBAttachInfo; class LLDB_API SBBlock; class LLDB_API SBBreakpoint; diff --git a/lldb/include/lldb/API/SBFunction.h b/lldb/include/lldb/API/SBFunction.h index 71b372a818e4b5..df607fdc7ebf59 100644 --- a/lldb/include/lldb/API/SBFunction.h +++ b/lldb/include/lldb/API/SBFunction.h @@ -10,6 +10,7 @@ #define LLDB_API_SBFUNCTION_H #include "lldb/API/SBAddress.h" +#include "lldb/API/SBAddressRangeList.h" #include "lldb/API/SBDefines.h" #include "lldb/API/SBInstructionList.h" @@ -44,6 +45,8 @@ class LLDB_API SBFunction { lldb::SBAddress GetEndAddress(); + lldb::SBAddressRangeList GetRanges(); + const char *GetArgumentName(uint32_t arg_idx); uint32_t GetPrologueByteSize(); diff --git a/lldb/include/lldb/API/SBStream.h b/lldb/include/lldb/API/SBStream.h index 0e33f05b69916f..71caf41fd75491 100644 --- a/lldb/include/lldb/API/SBStream.h +++ b/lldb/include/lldb/API/SBStream.h @@ -62,6 +62,8 @@ class LLDB_API SBStream { protected: friend class SBAddress; + friend class SBAddressRange; + friend class SBAddressRangeList; friend class SBBlock; friend class SBBreakpoint; friend class SBBreakpointLocation; diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index feeaa1cb71132b..35c2ed9c20a238 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -943,6 +943,7 @@ class LLDB_API SBTarget { protected: friend class SBAddress; + friend class SBAddressRange; friend class SBBlock; friend class SBBreakpoint; friend class SBBreakpointList; diff --git a/lldb/include/lldb/Core/AddressRange.h b/lldb/include/lldb/Core/AddressRange.h index 4a33c2d7958765..68a3ad0edd2d79 100644 --- a/lldb/include/lldb/Core/AddressRange.h +++ b/lldb/include/lldb/Core/AddressRange.h @@ -86,6 +86,8 @@ class AddressRange { /// (LLDB_INVALID_ADDRESS) and a zero byte size. 
void Clear(); + bool IsValid() const; + /// Check if a section offset address is contained in this range. /// /// \param[in] so_addr @@ -236,12 +238,24 @@ class AddressRange { /// The new size in bytes of this address range. void SetByteSize(lldb::addr_t byte_size) { m_byte_size = byte_size; } + bool GetDescription(Stream *s, Target *target) const; + + bool operator==(const AddressRange &rhs); + + bool operator!=(const AddressRange &rhs); + protected: // Member variables Address m_base_addr; ///< The section offset base address of this range. lldb::addr_t m_byte_size = 0; ///< The size in bytes of this address range. }; +// Forward-declarable wrapper. +class AddressRanges : public std::vector { +public: + using std::vector::vector; +}; + } // namespace lldb_private #endif // LLDB_CORE_ADDRESSRANGE_H diff --git a/lldb/include/lldb/Core/AddressRangeListImpl.h b/lldb/include/lldb/Core/AddressRangeListImpl.h new file mode 100644 index 00000000000000..46ebfe73d4d92d --- /dev/null +++ b/lldb/include/lldb/Core/AddressRangeListImpl.h @@ -0,0 +1,51 @@ +//===-- AddressRangeListImpl.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_CORE_ADDRESSRANGELISTIMPL_H +#define LLDB_CORE_ADDRESSRANGELISTIMPL_H + +#include "lldb/Core/AddressRange.h" +#include + +namespace lldb { +class SBBlock; +} + +namespace lldb_private { + +class AddressRangeListImpl { +public: + AddressRangeListImpl(); + + AddressRangeListImpl(const AddressRangeListImpl &rhs) = default; + + AddressRangeListImpl &operator=(const AddressRangeListImpl &rhs); + + size_t GetSize() const; + + void Reserve(size_t capacity); + + void Append(const AddressRange &sb_region); + + void Append(const AddressRangeListImpl &list); + + void Clear(); + + lldb_private::AddressRange GetAddressRangeAtIndex(size_t index); + +private: + friend class lldb::SBBlock; + + AddressRanges &ref(); + + AddressRanges m_ranges; +}; + +} // namespace lldb_private + +#endif // LLDB_CORE_ADDRESSRANGE_H diff --git a/lldb/include/lldb/Symbol/Block.h b/lldb/include/lldb/Symbol/Block.h index 02fd2add531033..c9c4d5ad767d7e 100644 --- a/lldb/include/lldb/Symbol/Block.h +++ b/lldb/include/lldb/Symbol/Block.h @@ -355,6 +355,8 @@ class Block : public UserID, public SymbolContextScope { // be able to get at any of the address ranges in a block. 
bool GetRangeAtIndex(uint32_t range_idx, AddressRange &range); + AddressRanges GetRanges(); + bool GetStartAddress(Address &addr); void SetDidParseVariables(bool b, bool set_children); diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index 10ba921b9dac8c..6d880b4da03c99 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -19,6 +19,8 @@ class ASTResultSynthesizer; class ASTStructExtractor; class Address; class AddressRange; +class AddressRanges; +class AddressRangeList; class AddressResolver; class ArchSpec; class Architecture; @@ -308,6 +310,7 @@ template class StreamBuffer; namespace lldb { typedef std::shared_ptr ABISP; +typedef std::unique_ptr AddressRangeUP; typedef std::shared_ptr BatonSP; typedef std::shared_ptr BlockSP; typedef std::shared_ptr BreakpointSP; diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index e8228afe103f9c..63971016093151 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -42,6 +42,8 @@ set_target_properties(lldb-sbapi-dwarf-enums PROPERTIES FOLDER "LLDB/Tablegennin add_lldb_library(liblldb SHARED ${option_framework} SBAddress.cpp + SBAddressRange.cpp + SBAddressRangeList.cpp SBAttachInfo.cpp SBBlock.cpp SBBreakpoint.cpp diff --git a/lldb/source/API/SBAddressRange.cpp b/lldb/source/API/SBAddressRange.cpp new file mode 100644 index 00000000000000..9b1affdade439c --- /dev/null +++ b/lldb/source/API/SBAddressRange.cpp @@ -0,0 +1,103 @@ +//===-- SBAddressRange.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBAddressRange.h" +#include "Utils.h" +#include "lldb/API/SBAddress.h" +#include "lldb/API/SBStream.h" +#include "lldb/API/SBTarget.h" +#include "lldb/Core/AddressRange.h" +#include "lldb/Core/Section.h" +#include "lldb/Utility/Instrumentation.h" +#include "lldb/Utility/Stream.h" +#include +#include + +using namespace lldb; +using namespace lldb_private; + +SBAddressRange::SBAddressRange() + : m_opaque_up(std::make_unique()) { + LLDB_INSTRUMENT_VA(this); +} + +SBAddressRange::SBAddressRange(const SBAddressRange &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + m_opaque_up = clone(rhs.m_opaque_up); +} + +SBAddressRange::SBAddressRange(lldb::SBAddress addr, lldb::addr_t byte_size) + : m_opaque_up(std::make_unique(addr.ref(), byte_size)) { + LLDB_INSTRUMENT_VA(this, addr, byte_size); +} + +SBAddressRange::~SBAddressRange() = default; + +const SBAddressRange &SBAddressRange::operator=(const SBAddressRange &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + if (this != &rhs) + m_opaque_up = clone(rhs.m_opaque_up); + return *this; +} + +bool SBAddressRange::operator==(const SBAddressRange &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + if (!IsValid() || !rhs.IsValid()) + return false; + return m_opaque_up->operator==(*(rhs.m_opaque_up)); +} + +bool SBAddressRange::operator!=(const SBAddressRange &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + return !(*this == rhs); +} + +void SBAddressRange::Clear() { + LLDB_INSTRUMENT_VA(this); + + m_opaque_up.reset(); +} + +bool SBAddressRange::IsValid() const { + LLDB_INSTRUMENT_VA(this); + + return m_opaque_up && m_opaque_up->IsValid(); +} + +lldb::SBAddress SBAddressRange::GetBaseAddress() const { + LLDB_INSTRUMENT_VA(this); + + if (!IsValid()) + return lldb::SBAddress(); + return lldb::SBAddress(m_opaque_up->GetBaseAddress()); +} + +lldb::addr_t SBAddressRange::GetByteSize() const { 
+ LLDB_INSTRUMENT_VA(this); + + if (!IsValid()) + return 0; + return m_opaque_up->GetByteSize(); +} + +bool SBAddressRange::GetDescription(SBStream &description, + const SBTarget target) { + LLDB_INSTRUMENT_VA(this, description, target); + + Stream &stream = description.ref(); + if (!IsValid()) { + stream << ""; + return true; + } + m_opaque_up->GetDescription(&stream, target.GetSP().get()); + return true; +} diff --git a/lldb/source/API/SBAddressRangeList.cpp b/lldb/source/API/SBAddressRangeList.cpp new file mode 100644 index 00000000000000..20660b3ff20882 --- /dev/null +++ b/lldb/source/API/SBAddressRangeList.cpp @@ -0,0 +1,94 @@ +//===-- SBAddressRangeList.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBAddressRangeList.h" +#include "Utils.h" +#include "lldb/API/SBAddressRange.h" +#include "lldb/API/SBStream.h" +#include "lldb/API/SBTarget.h" +#include "lldb/Core/AddressRangeListImpl.h" +#include "lldb/Utility/Instrumentation.h" +#include "lldb/Utility/Stream.h" + +#include + +using namespace lldb; +using namespace lldb_private; + +SBAddressRangeList::SBAddressRangeList() + : m_opaque_up(std::make_unique()) { + LLDB_INSTRUMENT_VA(this); +} + +SBAddressRangeList::SBAddressRangeList(const SBAddressRangeList &rhs) + : m_opaque_up(std::make_unique(*rhs.m_opaque_up)) { + LLDB_INSTRUMENT_VA(this, rhs); +} + +SBAddressRangeList::~SBAddressRangeList() = default; + +const SBAddressRangeList & +SBAddressRangeList::operator=(const SBAddressRangeList &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + if (this != &rhs) + *m_opaque_up = *rhs.m_opaque_up; + return *this; +} + +uint32_t SBAddressRangeList::GetSize() const { + LLDB_INSTRUMENT_VA(this); + + return 
m_opaque_up->GetSize(); +} + +SBAddressRange SBAddressRangeList::GetAddressRangeAtIndex(uint64_t idx) { + LLDB_INSTRUMENT_VA(this, idx); + + SBAddressRange sb_addr_range; + (*sb_addr_range.m_opaque_up) = m_opaque_up->GetAddressRangeAtIndex(idx); + return sb_addr_range; +} + +void SBAddressRangeList::Clear() { + LLDB_INSTRUMENT_VA(this); + + m_opaque_up->Clear(); +} + +void SBAddressRangeList::Append(const SBAddressRange &sb_addr_range) { + LLDB_INSTRUMENT_VA(this, sb_addr_range); + + m_opaque_up->Append(*sb_addr_range.m_opaque_up); +} + +void SBAddressRangeList::Append(const SBAddressRangeList &sb_addr_range_list) { + LLDB_INSTRUMENT_VA(this, sb_addr_range_list); + + m_opaque_up->Append(*sb_addr_range_list.m_opaque_up); +} + +bool SBAddressRangeList::GetDescription(SBStream &description, + const SBTarget &target) { + LLDB_INSTRUMENT_VA(this, description, target); + + const uint32_t num_ranges = GetSize(); + bool is_first = true; + Stream &stream = description.ref(); + stream << "["; + for (uint32_t i = 0; i < num_ranges; ++i) { + if (is_first) { + is_first = false; + } else { + stream.Printf(", "); + } + GetAddressRangeAtIndex(i).GetDescription(description, target); + } + stream << "]"; + return true; +} diff --git a/lldb/source/API/SBBlock.cpp b/lldb/source/API/SBBlock.cpp index 7d7565340836b1..2577b14920f065 100644 --- a/lldb/source/API/SBBlock.cpp +++ b/lldb/source/API/SBBlock.cpp @@ -13,6 +13,7 @@ #include "lldb/API/SBStream.h" #include "lldb/API/SBValue.h" #include "lldb/Core/AddressRange.h" +#include "lldb/Core/AddressRangeListImpl.h" #include "lldb/Core/ValueObjectVariable.h" #include "lldb/Symbol/Block.h" #include "lldb/Symbol/Function.h" @@ -219,6 +220,15 @@ lldb::SBAddress SBBlock::GetRangeEndAddress(uint32_t idx) { return sb_addr; } +lldb::SBAddressRangeList SBBlock::GetRanges() { + LLDB_INSTRUMENT_VA(this); + + lldb::SBAddressRangeList sb_ranges; + if (m_opaque_ptr) + sb_ranges.m_opaque_up->ref() = m_opaque_ptr->GetRanges(); + return sb_ranges; +} + 
uint32_t SBBlock::GetRangeIndexForBlockAddress(lldb::SBAddress block_addr) { LLDB_INSTRUMENT_VA(this, block_addr); diff --git a/lldb/source/API/SBFunction.cpp b/lldb/source/API/SBFunction.cpp index a01c7f79bbd31f..6a97352fc2c2fd 100644 --- a/lldb/source/API/SBFunction.cpp +++ b/lldb/source/API/SBFunction.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/API/SBFunction.h" +#include "lldb/API/SBAddressRange.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBStream.h" #include "lldb/Core/Disassembler.h" @@ -160,6 +161,19 @@ SBAddress SBFunction::GetEndAddress() { return addr; } +lldb::SBAddressRangeList SBFunction::GetRanges() { + LLDB_INSTRUMENT_VA(this); + + lldb::SBAddressRangeList ranges; + if (m_opaque_ptr) { + lldb::SBAddressRange range; + (*range.m_opaque_up) = m_opaque_ptr->GetAddressRange(); + ranges.Append(std::move(range)); + } + + return ranges; +} + const char *SBFunction::GetArgumentName(uint32_t arg_idx) { LLDB_INSTRUMENT_VA(this, arg_idx); diff --git a/lldb/source/Core/AddressRange.cpp b/lldb/source/Core/AddressRange.cpp index 1830f2ccd47fec..6cef7e149cd20b 100644 --- a/lldb/source/Core/AddressRange.cpp +++ b/lldb/source/Core/AddressRange.cpp @@ -14,6 +14,7 @@ #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/Stream.h" #include "lldb/lldb-defines.h" +#include "lldb/lldb-types.h" #include "llvm/Support/Compiler.h" @@ -145,6 +146,10 @@ void AddressRange::Clear() { m_byte_size = 0; } +bool AddressRange::IsValid() const { + return m_base_addr.IsValid() && (m_byte_size > 0); +} + bool AddressRange::Dump(Stream *s, Target *target, Address::DumpStyle style, Address::DumpStyle fallback_style) const { addr_t vmaddr = LLDB_INVALID_ADDRESS; @@ -203,3 +208,41 @@ void AddressRange::DumpDebug(Stream *s) const { static_cast(m_base_addr.GetSection().get()), m_base_addr.GetOffset(), GetByteSize()); } + +bool AddressRange::GetDescription(Stream *s, Target *target) const { + addr_t start_addr 
= m_base_addr.GetLoadAddress(target); + if (start_addr != LLDB_INVALID_ADDRESS) { + // We have a valid target and the address was resolved, or we have a base + // address with no section. Just print out a raw address range: [, + // ) + s->Printf("[0x%" PRIx64 "-0x%" PRIx64 ")", start_addr, + start_addr + GetByteSize()); + return true; + } + + // Either no target or the address wasn't resolved, print as + // [-) + const char *file_name = ""; + const auto section_sp = m_base_addr.GetSection(); + if (section_sp) { + if (const auto object_file = section_sp->GetObjectFile()) + file_name = object_file->GetFileSpec().GetFilename().AsCString(); + } + start_addr = m_base_addr.GetFileAddress(); + const addr_t end_addr = (start_addr == LLDB_INVALID_ADDRESS) + ? LLDB_INVALID_ADDRESS + : start_addr + GetByteSize(); + s->Printf("%s[0x%" PRIx64 "-0x%" PRIx64 ")", file_name, start_addr, end_addr); + return true; +} + +bool AddressRange::operator==(const AddressRange &rhs) { + if (!IsValid() || !rhs.IsValid()) + return false; + return m_base_addr == rhs.GetBaseAddress() && + m_byte_size == rhs.GetByteSize(); +} + +bool AddressRange::operator!=(const AddressRange &rhs) { + return !(*this == rhs); +} diff --git a/lldb/source/Core/AddressRangeListImpl.cpp b/lldb/source/Core/AddressRangeListImpl.cpp new file mode 100644 index 00000000000000..d405cf0fa3ec35 --- /dev/null +++ b/lldb/source/Core/AddressRangeListImpl.cpp @@ -0,0 +1,50 @@ +//===-- AddressRangeListImpl.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Core/AddressRangeListImpl.h" + +using namespace lldb; +using namespace lldb_private; + +AddressRangeListImpl::AddressRangeListImpl() : m_ranges() {} + +AddressRangeListImpl & +AddressRangeListImpl::operator=(const AddressRangeListImpl &rhs) { + if (this == &rhs) + return *this; + m_ranges = rhs.m_ranges; + return *this; +} + +size_t AddressRangeListImpl::GetSize() const { return m_ranges.size(); } + +void AddressRangeListImpl::Reserve(size_t capacity) { + m_ranges.reserve(capacity); +} + +void AddressRangeListImpl::Append(const AddressRange &sb_region) { + m_ranges.emplace_back(sb_region); +} + +void AddressRangeListImpl::Append(const AddressRangeListImpl &list) { + Reserve(GetSize() + list.GetSize()); + + for (const auto &range : list.m_ranges) + Append(range); +} + +void AddressRangeListImpl::Clear() { m_ranges.clear(); } + +lldb_private::AddressRange +AddressRangeListImpl::GetAddressRangeAtIndex(size_t index) { + if (index >= GetSize()) + return AddressRange(); + return m_ranges[index]; +} + +AddressRanges &AddressRangeListImpl::ref() { return m_ranges; } diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt index f24dbbd45a8e8c..dbc620b91b1ed1 100644 --- a/lldb/source/Core/CMakeLists.txt +++ b/lldb/source/Core/CMakeLists.txt @@ -20,6 +20,7 @@ endif() add_lldb_library(lldbCore Address.cpp AddressRange.cpp + AddressRangeListImpl.cpp AddressResolver.cpp AddressResolverFileLine.cpp Communication.cpp diff --git a/lldb/source/Symbol/Block.cpp b/lldb/source/Symbol/Block.cpp index 6eeabe0ff5e4d0..f7d9c0d2d33065 100644 --- a/lldb/source/Symbol/Block.cpp +++ b/lldb/source/Symbol/Block.cpp @@ -314,6 +314,22 @@ bool Block::GetRangeAtIndex(uint32_t range_idx, AddressRange &range) { return false; } +AddressRanges Block::GetRanges() { + AddressRanges ranges; + Function *function = 
CalculateSymbolContextFunction(); + if (!function) + return ranges; + for (size_t i = 0, e = m_ranges.GetSize(); i < e; ++i) { + ranges.emplace_back(); + auto &range = ranges.back(); + const Range &vm_range = m_ranges.GetEntryRef(i); + range.GetBaseAddress() = function->GetAddressRange().GetBaseAddress(); + range.GetBaseAddress().Slide(vm_range.GetRangeBase()); + range.SetByteSize(vm_range.GetByteSize()); + } + return ranges; +} + bool Block::GetStartAddress(Address &addr) { if (m_ranges.IsEmpty()) return false; diff --git a/lldb/test/API/python_api/address_range/Makefile b/lldb/test/API/python_api/address_range/Makefile new file mode 100644 index 00000000000000..99998b20bcb050 --- /dev/null +++ b/lldb/test/API/python_api/address_range/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/python_api/address_range/TestAddressRange.py b/lldb/test/API/python_api/address_range/TestAddressRange.py new file mode 100644 index 00000000000000..8c27558af4752d --- /dev/null +++ b/lldb/test/API/python_api/address_range/TestAddressRange.py @@ -0,0 +1,256 @@ +""" +Test SBAddressRange APIs. 
+""" + +import lldb +from lldbsuite.test.lldbtest import * + + +class AddressRangeTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + TestBase.setUp(self) + + self.build() + exe = self.getBuildArtifact("a.out") + + self.dbg.SetAsync(True) + + self.target = self.dbg.CreateTarget(exe) + self.assertTrue(self.target, VALID_TARGET) + self.launch_info = self.target.GetLaunchInfo() + self.launch_info.SetWorkingDirectory(self.get_process_working_directory()) + + self.bp1 = self.target.BreakpointCreateByName("main", "a.out") + self.bp2 = self.target.BreakpointCreateByName("foo", "a.out") + self.bp3 = self.target.BreakpointCreateByName("bar", "a.out") + + self.assertTrue(self.bp1.IsValid()) + self.assertTrue(self.bp2.IsValid()) + self.assertTrue(self.bp3.IsValid()) + + self.addr1 = self.bp1.GetLocationAtIndex(0).GetAddress() + self.addr2 = self.bp2.GetLocationAtIndex(0).GetAddress() + self.addr3 = self.bp3.GetLocationAtIndex(0).GetAddress() + + self.assertTrue(self.addr1.IsValid()) + self.assertTrue(self.addr2.IsValid()) + self.assertTrue(self.addr3.IsValid()) + + def test_address_range_default(self): + """Testing default constructor.""" + empty_range = lldb.SBAddressRange() + self.assertEqual(empty_range.IsValid(), False) + + def test_address_range_construction(self): + """Make sure the construction and getters work.""" + range = lldb.SBAddressRange(self.addr1, 8) + self.assertEqual(range.IsValid(), True) + self.assertEqual(range.GetBaseAddress(), self.addr1) + self.assertEqual(range.GetByteSize(), 8) + + def test_address_range_clear(self): + """Make sure the clear method works.""" + range = lldb.SBAddressRange(self.addr1, 8) + self.assertEqual(range.IsValid(), True) + self.assertEqual(range.GetBaseAddress(), self.addr1) + self.assertEqual(range.GetByteSize(), 8) + + range.Clear() + self.assertEqual(range.IsValid(), False) + + def test_function(self): + """Make sure the range works in SBFunction APIs.""" + + # Setup breakpoints in main + loc = 
self.bp1.GetLocationAtIndex(0) + loc_addr = loc.GetAddress() + func = loc_addr.GetFunction() + ranges = func.GetRanges() + self.assertEqual(ranges.GetSize(), 1) + + range = ranges.GetAddressRangeAtIndex(0) + self.assertEqual( + range.GetByteSize(), + func.GetEndAddress().GetOffset() - func.GetStartAddress().GetOffset(), + ) + self.assertEqual( + range.GetBaseAddress().GetOffset(), + func.GetStartAddress().GetOffset(), + ) + + def test_block(self): + """Make sure the range works in SBBlock APIs.""" + loc = self.bp1.GetLocationAtIndex(0) + loc_addr = loc.GetAddress() + block = loc_addr.GetBlock() + + ranges = block.GetRanges() + self.assertEqual(ranges.GetSize(), 1) + + range = ranges.GetAddressRangeAtIndex(0) + self.assertEqual( + range.GetByteSize(), + block.GetRangeEndAddress(0).GetOffset() + - block.GetRangeStartAddress(0).GetOffset(), + ) + self.assertEqual( + range.GetBaseAddress().GetOffset(), + block.GetRangeStartAddress(0).GetOffset(), + ) + + def test_address_range_list(self): + """Make sure the SBAddressRangeList works by adding and getting ranges.""" + range1 = lldb.SBAddressRange(self.addr1, 8) + range2 = lldb.SBAddressRange(self.addr2, 16) + range3 = lldb.SBAddressRange(self.addr3, 32) + + range_list = lldb.SBAddressRangeList() + self.assertEqual(range_list.GetSize(), 0) + + range_list.Append(range1) + range_list.Append(range2) + range_list.Append(range3) + self.assertEqual(range_list.GetSize(), 3) + self.assertRaises(IndexError, lambda: range_list[3]) + + range1_copy = range_list.GetAddressRangeAtIndex(0) + self.assertEqual(range1.GetByteSize(), range1_copy.GetByteSize()) + self.assertEqual( + range1.GetBaseAddress().GetOffset(), + range1_copy.GetBaseAddress().GetOffset(), + ) + + range2_copy = range_list.GetAddressRangeAtIndex(1) + self.assertEqual(range2.GetByteSize(), range2_copy.GetByteSize()) + self.assertEqual( + range2.GetBaseAddress().GetOffset(), + range2_copy.GetBaseAddress().GetOffset(), + ) + + range3_copy = 
range_list.GetAddressRangeAtIndex(2) + self.assertEqual(range3.GetByteSize(), range3_copy.GetByteSize()) + self.assertEqual( + range3.GetBaseAddress().GetOffset(), + range3_copy.GetBaseAddress().GetOffset(), + ) + + range_list.Clear() + self.assertEqual(range_list.GetSize(), 0) + + def test_address_range_list_len(self): + """Make sure the len() operator works.""" + range = lldb.SBAddressRange(self.addr1, 8) + + range_list = lldb.SBAddressRangeList() + self.assertEqual(len(range_list), 0) + + range_list.Append(range) + self.assertEqual(len(range_list), 1) + + def test_address_range_list_iterator(self): + """Make sure the SBAddressRangeList iterator works.""" + range1 = lldb.SBAddressRange(self.addr1, 8) + range2 = lldb.SBAddressRange(self.addr2, 16) + range3 = lldb.SBAddressRange(self.addr3, 32) + + range_list = lldb.SBAddressRangeList() + range_list.Append(range1) + range_list.Append(range2) + range_list.Append(range3) + self.assertEqual(range_list.GetSize(), 3) + + # Test the iterator + for range in range_list: + self.assertTrue(range.IsValid()) + + def test_address_range_print_invalid(self): + """Make sure the SBAddressRange can be printed when invalid.""" + range = lldb.SBAddressRange() + self.assertEqual(str(range), "") + + def test_address_range_print_resolved(self): + """Make sure the SBAddressRange can be printed when resolved.""" + lldb.target = self.target + error = lldb.SBError() + process = self.target.Launch(self.launch_info, error) + self.assertTrue(error.Success(), "Make sure process launched successfully") + self.assertTrue(process, PROCESS_IS_VALID) + self.assertState(process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) + + loc = self.bp1.GetLocationAtIndex(0) + loc_addr = loc.GetAddress() + func = loc_addr.GetFunction() + range = func.GetRanges().GetAddressRangeAtIndex(0) + range_str = str(range) + # [0x1000-0x2000] // Resolved with target or addresses without sections + self.assertRegex(range_str, "^\[0x[0-9a-f]+\-0x[0-9a-f]+\)$") + 
process.Kill() + + def test_address_range_print_no_section_resolved(self): + """Make sure the SBAddressRange can be printed with no secion.""" + lldb.target = self.target + error = lldb.SBError() + process = self.target.Launch(self.launch_info, error) + self.assertTrue(error.Success(), "Make sure process launched successfully") + self.assertTrue(process, PROCESS_IS_VALID) + self.assertState(process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) + + loc = self.bp1.GetLocationAtIndex(0) + loc_addr = loc.GetAddress() + func = loc_addr.GetFunction() + range = func.GetRanges().GetAddressRangeAtIndex(0) + + addr = lldb.SBAddress() + addr.SetAddress(lldb.SBSection(), range.GetBaseAddress().GetOffset()) + self.assertFalse(addr.GetSection().IsValid()) + range = lldb.SBAddressRange(addr, range.GetByteSize()) + + range_str = str(range) + # [0x1000-0x2000] // Resolved with target or addresses without sections + self.assertRegex(range_str, "^\[0x[0-9a-f]+\-0x[0-9a-f]+\)$") + process.Kill() + + def test_address_range_print_not_resolved(self): + """Make sure the SBAddressRange can be printed when not resolved.""" + range = lldb.SBAddressRange(self.addr1, 8) + range_str = str(range) + # a.out[0x1000-0x2000] // Without target + self.assertRegex(range_str, "^a.out\[0x[0-9a-f]+\-0x[0-9a-f]+\)$") + + def test_address_range_list_print(self): + """Make sure the SBAddressRangeList can be printed.""" + range1 = lldb.SBAddressRange(self.addr1, 8) + range2 = lldb.SBAddressRange(self.addr2, 16) + range3 = lldb.SBAddressRange(self.addr3, 32) + self.dbg.SetAsync(True) + + range_list = lldb.SBAddressRangeList() + self.assertEqual(range_list.GetSize(), 0) + + range_list.Append(range1) + range_list.Append(range2) + range_list.Append(range3) + self.assertEqual(range_list.GetSize(), 3) + + range_list_str = str(range_list) + self.assertTrue(range_list_str.startswith("[")) + self.assertGreater(range_list_str.count(","), 1) + self.assertTrue(range_list_str.endswith("]")) + + def 
test_address_range_list_indexing(self): + """Make sure the SBAddressRangeList can be printed.""" + range1 = lldb.SBAddressRange(self.addr1, 8) + range2 = lldb.SBAddressRange(self.addr2, 16) + range_list = lldb.SBAddressRangeList() + range_list.Append(range1) + range_list.Append(range2) + + self.assertEqual(range_list.GetSize(), 2) + self.assertRaises(IndexError, lambda: range_list[2]) + self.assertRaises(TypeError, lambda: range_list["0"]) + self.assertEqual(range_list[0], range1) + self.assertEqual(range_list[1], range2) + self.assertEqual(range_list[-1], range2) + self.assertEqual(range_list[-2], range1) diff --git a/lldb/test/API/python_api/address_range/main.cpp b/lldb/test/API/python_api/address_range/main.cpp new file mode 100644 index 00000000000000..b6eaec4a23699b --- /dev/null +++ b/lldb/test/API/python_api/address_range/main.cpp @@ -0,0 +1,8 @@ +void foo() {} +void bar() {} + +int main() { + foo(); + bar(); + return 0; +} From 79c7342f49f1ed7aa971e7857954b45906154943 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 28 May 2024 16:31:25 +0000 Subject: [PATCH 33/89] [gn build] Port 42944e460082 --- llvm/utils/gn/secondary/lldb/source/API/BUILD.gn | 2 ++ llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn | 1 + 2 files changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn index c99c1b5483355b..f0bf6a8f3dbaf8 100644 --- a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn @@ -40,6 +40,8 @@ target(liblldb_type, "liblldb") { include_dirs = [ ".." 
] sources = [ "SBAddress.cpp", + "SBAddressRange.cpp", + "SBAddressRangeList.cpp", "SBAttachInfo.cpp", "SBBlock.cpp", "SBBreakpoint.cpp", diff --git a/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn index 30a9fb3ecceaa0..0c9632a0a1915f 100644 --- a/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn @@ -45,6 +45,7 @@ static_library("Core") { sources = [ "Address.cpp", "AddressRange.cpp", + "AddressRangeListImpl.cpp", "AddressResolver.cpp", "AddressResolverFileLine.cpp", "Communication.cpp", From 7bea41e173367e2a535bd2188fd652a2ca267b90 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 28 May 2024 17:42:58 +0100 Subject: [PATCH 34/89] LoopIdiomRecognize: strip bad TODO (NFC) (#92890) There are several reasons why handling powi in LoopIdiomRecognize is a bad idea: - powi corresponds to a GCC builtin that is only defined for C int (which is i32 for most targets). - powi isn't always lowered by targets correctly for non-i32 parameters. Several targets fail to compile llvm.powi.f32.i16, for example. - Unlike memcpy and memset, which tend to be important enough internal intrinsics that you have to handle them correctly even in freestanding modes, powi isn't. Strip this bad TODO to avoid misleading contributors. --- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index c7e25c9f3d2c92..3fe5478408d457 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -22,8 +22,6 @@ // // Future loop memory idioms to recognize: // memcmp, strlen, etc. 
-// Future floating point idioms to recognize in -ffast-math mode: -// fpowi // // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). @@ -1107,7 +1105,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( GV->setAlignment(Align(16)); Value *PatternPtr = GV; NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); - + // Set the TBAA info if present. if (AATags.TBAA) NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA); @@ -1117,7 +1115,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( if (AATags.NoAlias) NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias); - } + } NewCall->setDebugLoc(TheStore->getDebugLoc()); From 16a5fd3fdb91ffb39b97dbd3a7e9346ba406360d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 28 May 2024 18:57:38 +0200 Subject: [PATCH 35/89] DAG: Use flags in isLegalToCombineMinNumMaxNum (#93555) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 +- .../AMDGPU/select-flags-to-fmin-fmax.ll | 1757 +++++++++++++++++ 2 files changed, 1768 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 93d866384b4829..2f4fdf5208d076 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11186,17 +11186,19 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) { return SDValue(); } -// FIXME: This should be checking for no signed zeros on individual operands, as -// well as no nans. 
static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS, - SDValue RHS, + SDValue RHS, const SDNodeFlags Flags, const TargetLowering &TLI) { - const TargetOptions &Options = DAG.getTarget().Options; EVT VT = LHS.getValueType(); + if (!VT.isFloatingPoint()) + return false; + + const TargetOptions &Options = DAG.getTarget().Options; - return Options.NoSignedZerosFPMath && VT.isFloatingPoint() && + return (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) && TLI.isProfitableToCombineMinNumMaxNum(VT) && - DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS); + (Flags.hasNoNaNs() || + (DAG.isKnownNeverNaN(RHS) && DAG.isKnownNeverNaN(LHS))); } static SDValue combineMinNumMaxNumImpl(const SDLoc &DL, EVT VT, SDValue LHS, @@ -11674,7 +11676,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // select (fcmp gt x, y), x, y -> fmaxnum x, y // // This is OK if we don't care what happens if either operand is a NaN. - if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI)) + if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, Flags, TLI)) if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2, CC)) return FMinMax; @@ -12267,7 +12269,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { // This is OK if we don't care about what happens if either operand is a // NaN. 
// - if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) { + if (N0.hasOneUse() && + isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, N->getFlags(), TLI)) { if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC)) return FMinMax; } diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll new file mode 100644 index 00000000000000..50a3336a7483c7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -0,0 +1,1757 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +; Test if fcmp+select patterns form min/max instructions when allowed +; by flags. + +; TODO: Merge with fmin_legacy.ll/fmax_legacy.ll + +define float @v_test_fmin_legacy_ule_f32_safe(float %a, float %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f32_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f32_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f32_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule float %a, %b + %val = 
select i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmin_legacy_ule_f32_nnan_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f32_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f32_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f32_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule float %a, %b + %val = select nnan i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmin_legacy_ule_f32_nsz_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f32_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f32_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f32_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + 
%cmp = fcmp ule float %a, %b + %val = select nsz i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmin_legacy_ule_f32_nnan_nsz_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f32_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f32_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f32_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_safe(float %a, float %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge float %a, %b + %val = select i1 %cmp, float %a, float 
%b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_nnan_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge float %a, %b + %val = select nnan i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_nsz_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge float %a, %b + 
%val = select nsz i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_nnan_nsz_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + ret float %val +} + +define <2 x float> @v_test_fmin_legacy_ule_v2f32_safe(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: 
v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x float> %a, %b + %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x float> %a, %b + %val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmin_legacy_ule_v2f32_nsz_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
v_test_fmin_legacy_ule_v2f32_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x float> %a, %b + %val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, 
v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x float> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmax_legacy_uge_v2f32_safe(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x float> %a, %b + %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_flag: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x float> %a, %b + %val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmax_legacy_uge_v2f32_nsz_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; 
GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x float> %a, %b + %val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x float> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
v_test_fmin_legacy_ule_f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select nnan i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: 
+; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select nsz i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmin_legacy_ule_f16_nnan_nsz_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: 
s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge half %a, %b + %val = select i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: 
v_test_fmax_legacy_uge_f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge half %a, %b + %val = select nnan i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge half %a, %b + %val = select nsz i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmax_legacy_uge_f16_nnan_nsz_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; 
GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + +define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: 
s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select nnan nsz <2 x 
i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x half> %a, %b + %val = select <2 x i1> %cmp, <2 x 
half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x half> %a, %b + %val = select nnan <2 x i1> 
%cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x half> %a, %b + %val = select nsz <2 
x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x half> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 
v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; 
GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; 
GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: 
v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 
v5, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <4 x half> %a, %b + %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: 
v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <4 x half> %a, %b + %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <4 x 
half> %a, %b + %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define float @v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs(float 
%arg0, float %arg1) { +; GFX7-LABEL: v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a = fadd nnan float %arg0, %arg0 + %b = fadd nnan float %arg1, %arg1 + %cmp = fcmp ule float %a, %b + %val = select nsz i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs(float %arg0, float %arg1) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a = fadd nnan float %arg0, %arg0 + %b = fadd nnan float %arg1, %arg1 + %cmp = fcmp uge float %a, %b + %val = select nsz i1 %cmp, float %a, float %b + ret float %val +} From b963931eb8bda810e2a8ad08832402993b931d69 Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Tue, 28 May 2024 10:21:22 -0700 Subject: [PATCH 36/89] [lld-macho][ObjC] Implement category merging into base class (#92448) Currently category merging only supports merging multiple categories into one. With this commit we add the ability to fully merge categories into the base class, if the base class is included in the current module. This is the optimal approach for defined classes. 
--- lld/MachO/ObjC.cpp | 179 +++++++++++++-- .../objc-category-merging-complete-test.s | 210 ++++++++++++++++++ ...imal.s => objc-category-merging-minimal.s} | 125 ++++++++++- 3 files changed, 500 insertions(+), 14 deletions(-) rename lld/test/MachO/{objc-category-merging-extern-class-minimal.s => objc-category-merging-minimal.s} (59%) diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp index 9d1612beae872e..635ded554497ba 100644 --- a/lld/MachO/ObjC.cpp +++ b/lld/MachO/ObjC.cpp @@ -379,12 +379,21 @@ class ObjcCategoryMerger { InfoWriteSection catPtrListInfo; }; - // Information about a pointer list in the original categories (method lists, - // protocol lists, etc) + // Information about a pointer list in the original categories or class(method + // lists, protocol lists, etc) struct PointerListInfo { + PointerListInfo() = default; + PointerListInfo(const PointerListInfo &) = default; PointerListInfo(const char *_categoryPrefix, uint32_t _pointersPerStruct) : categoryPrefix(_categoryPrefix), pointersPerStruct(_pointersPerStruct) {} + + inline bool operator==(const PointerListInfo &cmp) { + return pointersPerStruct == cmp.pointersPerStruct && + structSize == cmp.structSize && structCount == cmp.structCount && + allPtrs == cmp.allPtrs; + } + const char *categoryPrefix; uint32_t pointersPerStruct = 0; @@ -395,9 +404,9 @@ class ObjcCategoryMerger { std::vector allPtrs; }; - // Full information about all the categories that extend a class. This will - // include all the additional methods, protocols, and properties that are - // contained in all the categories that extend a particular class. + // Full information describing an ObjC class . This will include all the + // additional methods, protocols, and properties that are contained in the + // class and all the categories that extend a particular class. 
struct ClassExtensionInfo { ClassExtensionInfo(CategoryLayout &_catLayout) : catLayout(_catLayout){}; @@ -449,6 +458,9 @@ class ObjcCategoryMerger { void parseProtocolListInfo(const ConcatInputSection *isec, uint32_t secOffset, PointerListInfo &ptrList); + PointerListInfo parseProtocolListInfo(const ConcatInputSection *isec, + uint32_t secOffset); + void parsePointerListInfo(const ConcatInputSection *isec, uint32_t secOffset, PointerListInfo &ptrList); @@ -456,9 +468,9 @@ class ObjcCategoryMerger { const ClassExtensionInfo &extInfo, const PointerListInfo &ptrList); - void emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset, - const ClassExtensionInfo &extInfo, - const PointerListInfo &ptrList); + Defined *emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset, + const ClassExtensionInfo &extInfo, + const PointerListInfo &ptrList); Defined *emitCategory(const ClassExtensionInfo &extInfo); Defined *emitCatListEntrySec(const std::string &forCategoryName, @@ -474,6 +486,10 @@ class ObjcCategoryMerger { uint32_t offset); Defined *tryGetDefinedAtIsecOffset(const ConcatInputSection *isec, uint32_t offset); + Defined *getClassRo(const Defined *classSym, bool getMetaRo); + void mergeCategoriesIntoBaseClass(const Defined *baseClass, + std::vector &categories); + void eraseSymbolAtIsecOffset(ConcatInputSection *isec, uint32_t offset); void tryEraseDefinedAtIsecOffset(const ConcatInputSection *isec, uint32_t offset); @@ -552,6 +568,29 @@ ObjcCategoryMerger::tryGetDefinedAtIsecOffset(const ConcatInputSection *isec, return dyn_cast_or_null(sym); } +// Get the class's ro_data symbol. If getMetaRo is true, then we will return +// the meta-class's ro_data symbol. Otherwise, we will return the class +// (instance) ro_data symbol. 
+Defined *ObjcCategoryMerger::getClassRo(const Defined *classSym, + bool getMetaRo) { + ConcatInputSection *isec = dyn_cast(classSym->isec()); + if (!isec) + return nullptr; + + if (!getMetaRo) + return tryGetDefinedAtIsecOffset(isec, classLayout.roDataOffset + + classSym->value); + + Defined *metaClass = tryGetDefinedAtIsecOffset( + isec, classLayout.metaClassOffset + classSym->value); + if (!metaClass) + return nullptr; + + return tryGetDefinedAtIsecOffset( + dyn_cast(metaClass->isec()), + classLayout.roDataOffset); +} + // Given an ConcatInputSection or CStringInputSection and an offset, if there is // a symbol(Defined) at that offset, then erase the symbol (mark it not live) void ObjcCategoryMerger::tryEraseDefinedAtIsecOffset( @@ -663,6 +702,15 @@ void ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec, "Protocol list end offset does not match expected size"); } +// Parse a protocol list and return the PointerListInfo for it +ObjcCategoryMerger::PointerListInfo +ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec, + uint32_t secOffset) { + PointerListInfo ptrList; + parseProtocolListInfo(isec, secOffset, ptrList); + return ptrList; +} + // Parse a pointer list that might be linked to ConcatInputSection at a given // offset. This can be used for instance methods, class methods, instance props // and class props since they have the same format. @@ -769,11 +817,11 @@ void ObjcCategoryMerger::parseCatInfoToExtInfo(const InfoInputCategory &catInfo, // Generate a protocol list (including header) and link it into the parent at // the specified offset. 
-void ObjcCategoryMerger::emitAndLinkProtocolList( +Defined *ObjcCategoryMerger::emitAndLinkProtocolList( Defined *parentSym, uint32_t linkAtOffset, const ClassExtensionInfo &extInfo, const PointerListInfo &ptrList) { if (ptrList.allPtrs.empty()) - return; + return nullptr; assert(ptrList.allPtrs.size() == ptrList.structCount); @@ -820,6 +868,8 @@ void ObjcCategoryMerger::emitAndLinkProtocolList( infoCategoryWriter.catPtrListInfo.relocTemplate); offset += target->wordSize; } + + return ptrListSym; } // Generate a pointer list (including header) and link it into the parent at the @@ -1265,10 +1315,15 @@ void ObjcCategoryMerger::removeRefsToErasedIsecs() { void ObjcCategoryMerger::doMerge() { collectAndValidateCategoriesData(); - for (auto &entry : categoryMap) - if (entry.second.size() > 1) + for (auto &[baseClass, catInfos] : categoryMap) { + if (auto *baseClassDef = dyn_cast(baseClass)) { + // Merge all categories into the base class + mergeCategoriesIntoBaseClass(baseClassDef, catInfos); + } else if (catInfos.size() > 1) { // Merge all categories into a new, single category - mergeCategoriesIntoSingleCategory(entry.second); + mergeCategoriesIntoSingleCategory(catInfos); + } + } // Erase all categories that were merged eraseMergedCategories(); @@ -1302,3 +1357,101 @@ void objc::mergeCategories() { } void objc::doCleanup() { ObjcCategoryMerger::doCleanup(); } + +void ObjcCategoryMerger::mergeCategoriesIntoBaseClass( + const Defined *baseClass, std::vector &categories) { + assert(categories.size() >= 1 && "Expected at least one category to merge"); + + // Collect all the info from the categories + ClassExtensionInfo extInfo(catLayout); + for (auto &catInfo : categories) { + parseCatInfoToExtInfo(catInfo, extInfo); + } + + // Get metadata for the base class + Defined *metaRo = getClassRo(baseClass, /*getMetaRo=*/true); + ConcatInputSection *metaIsec = dyn_cast(metaRo->isec()); + Defined *classRo = getClassRo(baseClass, /*getMetaRo=*/false); + ConcatInputSection 
*classIsec = dyn_cast(classRo->isec()); + + // Now collect the info from the base class from the various lists in the + // class metadata + + // Protocol lists are a special case - the same protocol list is in classRo + // and metaRo, so we only need to parse it once + parseProtocolListInfo(classIsec, roClassLayout.baseProtocolsOffset, + extInfo.protocols); + + // Check that the classRo and metaRo protocol lists are identical + assert( + parseProtocolListInfo(classIsec, roClassLayout.baseProtocolsOffset) == + parseProtocolListInfo(metaIsec, roClassLayout.baseProtocolsOffset) && + "Category merger expects classRo and metaRo to have the same protocol " + "list"); + + parsePointerListInfo(metaIsec, roClassLayout.baseMethodsOffset, + extInfo.classMethods); + parsePointerListInfo(classIsec, roClassLayout.baseMethodsOffset, + extInfo.instanceMethods); + + parsePointerListInfo(metaIsec, roClassLayout.basePropertiesOffset, + extInfo.classProps); + parsePointerListInfo(classIsec, roClassLayout.basePropertiesOffset, + extInfo.instanceProps); + + // Erase the old lists - these will be generated and replaced + eraseSymbolAtIsecOffset(metaIsec, roClassLayout.baseMethodsOffset); + eraseSymbolAtIsecOffset(metaIsec, roClassLayout.baseProtocolsOffset); + eraseSymbolAtIsecOffset(metaIsec, roClassLayout.basePropertiesOffset); + eraseSymbolAtIsecOffset(classIsec, roClassLayout.baseMethodsOffset); + eraseSymbolAtIsecOffset(classIsec, roClassLayout.baseProtocolsOffset); + eraseSymbolAtIsecOffset(classIsec, roClassLayout.basePropertiesOffset); + + // Emit the newly merged lists - first into the meta RO then into the class RO + // First we emit and link the protocol list into the meta RO. 
Then we link it + // in the classRo as well (they're supposed to be identical) + if (Defined *protoListSym = + emitAndLinkProtocolList(metaRo, roClassLayout.baseProtocolsOffset, + extInfo, extInfo.protocols)) { + createSymbolReference(classRo, protoListSym, + roClassLayout.baseProtocolsOffset, + infoCategoryWriter.catBodyInfo.relocTemplate); + } + + emitAndLinkPointerList(metaRo, roClassLayout.baseMethodsOffset, extInfo, + extInfo.classMethods); + emitAndLinkPointerList(classRo, roClassLayout.baseMethodsOffset, extInfo, + extInfo.instanceMethods); + + emitAndLinkPointerList(metaRo, roClassLayout.basePropertiesOffset, extInfo, + extInfo.classProps); + + emitAndLinkPointerList(classRo, roClassLayout.basePropertiesOffset, extInfo, + extInfo.instanceProps); + + // Mark all the categories as merged - this will be used to erase them later + for (auto &catInfo : categories) + catInfo.wasMerged = true; +} + +// Erase the symbol at a given offset in an InputSection +void ObjcCategoryMerger::eraseSymbolAtIsecOffset(ConcatInputSection *isec, + uint32_t offset) { + Defined *sym = tryGetDefinedAtIsecOffset(isec, offset); + if (!sym) + return; + + // Remove the symbol from isec->symbols + assert(isa(sym) && "Can only erase a Defined"); + llvm::erase(isec->symbols, sym); + + // Remove the relocs that refer to this symbol + auto removeAtOff = [offset](Reloc const &r) { return r.offset == offset; }; + llvm::erase_if(isec->relocs, removeAtOff); + + // Now, if the symbol fully occupies a ConcatInputSection, we can also erase + // the whole ConcatInputSection + if (ConcatInputSection *cisec = dyn_cast(sym->isec())) + if (cisec->data.size() == sym->size) + eraseISec(cisec); +} diff --git a/lld/test/MachO/objc-category-merging-complete-test.s b/lld/test/MachO/objc-category-merging-complete-test.s index 74400177b550dc..cf3e19e2f9c8b4 100644 --- a/lld/test/MachO/objc-category-merging-complete-test.s +++ b/lld/test/MachO/objc-category-merging-complete-test.s @@ -1,6 +1,7 @@ # REQUIRES: 
aarch64 # RUN: rm -rf %t; split-file %s %t && cd %t +############ Test merging multiple categories into a single category ############ ## Create a dylib to link against(a64_file1.dylib) and merge categories in the main binary (file2_merge_a64.exe) # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_file1.o a64_file1.s # RUN: %lld -arch arm64 a64_file1.o -o a64_file1.dylib -dylib @@ -12,6 +13,10 @@ # RUN: llvm-objdump --objc-meta-data --macho a64_file2_no_merge.exe | FileCheck %s --check-prefixes=NO_MERGE_CATS # RUN: llvm-objdump --objc-meta-data --macho a64_file2_merge.exe | FileCheck %s --check-prefixes=MERGE_CATS +############ Test merging multiple categories into the base class ############ +# RUN: %lld -arch arm64 -o a64_file2_merge_into_class.exe -objc_category_merging a64_file1.o a64_file2.o +# RUN: llvm-objdump --objc-meta-data --macho a64_file2_merge_into_class.exe | FileCheck %s --check-prefixes=MERGE_CATS_CLS + MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass(Category02|Category03) MERGE_CATS-NEXT: name {{.*}} Category02|Category03 @@ -101,6 +106,211 @@ NO_MERGE_CATS-NEXT: 24 NO_MERGE_CATS-NEXT: 2 +MERGE_CATS_CLS: _OBJC_CLASS_$_MyBaseClass +MERGE_CATS_CLS-NEXT: isa {{.*}} _OBJC_METACLASS_$_MyBaseClass +MERGE_CATS_CLS-NEXT: superclass 0x0 +MERGE_CATS_CLS-NEXT: cache {{.*}} __objc_empty_cache +MERGE_CATS_CLS-NEXT: vtable 0x0 +MERGE_CATS_CLS-NEXT: data {{.*}} (struct class_ro_t *) +MERGE_CATS_CLS-NEXT: flags 0x2 RO_ROOT +MERGE_CATS_CLS-NEXT: instanceStart 0 +MERGE_CATS_CLS-NEXT: instanceSize 4 +MERGE_CATS_CLS-NEXT: reserved 0x0 +MERGE_CATS_CLS-NEXT: ivarLayout 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyBaseClass +MERGE_CATS_CLS-NEXT: baseMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 8 +MERGE_CATS_CLS-NEXT: name {{.*}} class02InstanceMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass(Category02) class02InstanceMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} 
myProtocol02Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass(Category02) myProtocol02Method] +MERGE_CATS_CLS-NEXT: name {{.*}} class03InstanceMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass(Category03) class03InstanceMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol03Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass(Category03) myProtocol03Method] +MERGE_CATS_CLS-NEXT: name {{.*}} baseInstanceMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass baseInstanceMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol01Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass myProtocol01Method] +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass MyProtocol01Prop] +MERGE_CATS_CLS-NEXT: name {{.*}} setMyProtocol01Prop: +MERGE_CATS_CLS-NEXT: types {{.*}} v20@0:8i16 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass setMyProtocol01Prop:] +MERGE_CATS_CLS-NEXT: baseProtocols {{.*}} +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: list[0] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 2 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol02Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: list[1] {{.*}} (struct protocol_t *) 
+MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 2 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol03Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: list[2] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol01Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} setMyProtocol01Prop: +MERGE_CATS_CLS-NEXT: types {{.*}} v20@0:8i16 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: ivars {{.*}} +MERGE_CATS_CLS-NEXT: entsize 32 +MERGE_CATS_CLS-NEXT: count 1 +MERGE_CATS_CLS-NEXT: offset {{.*}} 0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: type {{.*}} i +MERGE_CATS_CLS-NEXT: alignment 2 +MERGE_CATS_CLS-NEXT: size 4 +MERGE_CATS_CLS-NEXT: weakIvarLayout 0x0 +MERGE_CATS_CLS-NEXT: baseProperties {{.*}} +MERGE_CATS_CLS-NEXT: 
entsize 16 +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02Prop +MERGE_CATS_CLS-NEXT: attributes {{.*}} Ti,R,D +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03Prop +MERGE_CATS_CLS-NEXT: attributes {{.*}} Ti,R,D +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: attributes {{.*}} Ti,N,VMyProtocol01Prop +MERGE_CATS_CLS-NEXT: Meta Class +MERGE_CATS_CLS-NEXT: isa {{.*}} _OBJC_METACLASS_$_MyBaseClass +MERGE_CATS_CLS-NEXT: superclass {{.*}} _OBJC_CLASS_$_MyBaseClass +MERGE_CATS_CLS-NEXT: cache {{.*}} __objc_empty_cache +MERGE_CATS_CLS-NEXT: vtable 0x0 +MERGE_CATS_CLS-NEXT: data {{.*}} (struct class_ro_t *) +MERGE_CATS_CLS-NEXT: flags 0x3 RO_META RO_ROOT +MERGE_CATS_CLS-NEXT: instanceStart 40 +MERGE_CATS_CLS-NEXT: instanceSize 40 +MERGE_CATS_CLS-NEXT: reserved 0x0 +MERGE_CATS_CLS-NEXT: ivarLayout 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyBaseClass +MERGE_CATS_CLS-NEXT: baseMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 5 +MERGE_CATS_CLS-NEXT: name {{.*}} class02ClassMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass(Category02) class02ClassMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass(Category02) MyProtocol02Prop] +MERGE_CATS_CLS-NEXT: name {{.*}} class03ClassMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass(Category03) class03ClassMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass(Category03) MyProtocol03Prop] +MERGE_CATS_CLS-NEXT: name {{.*}} baseClassMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass baseClassMethod] +MERGE_CATS_CLS-NEXT: baseProtocols {{.*}} +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: list[0] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 
0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 2 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol02Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: list[1] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 2 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol03Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: list[2] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol01Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 
+MERGE_CATS_CLS-NEXT: name {{.*}} setMyProtocol01Prop: +MERGE_CATS_CLS-NEXT: types {{.*}} v20@0:8i16 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: ivars 0x0 +MERGE_CATS_CLS-NEXT: weakIvarLayout 0x0 +MERGE_CATS_CLS-NEXT: baseProperties 0x0 +MERGE_CATS_CLS: __OBJC_$_CATEGORY_MyBaseClass_$_Category04 + + #--- a64_file1.s ## @protocol MyProtocol01 diff --git a/lld/test/MachO/objc-category-merging-extern-class-minimal.s b/lld/test/MachO/objc-category-merging-minimal.s similarity index 59% rename from lld/test/MachO/objc-category-merging-extern-class-minimal.s rename to lld/test/MachO/objc-category-merging-minimal.s index 5dd8924df5ad68..fcd90f178b150e 100644 --- a/lld/test/MachO/objc-category-merging-extern-class-minimal.s +++ b/lld/test/MachO/objc-category-merging-minimal.s @@ -1,7 +1,8 @@ # REQUIRES: aarch64 # RUN: rm -rf %t; split-file %s %t && cd %t -## Create a dylib with a fake base class to link against +############ Test merging multiple categories into a single category ############ +## Create a dylib with a fake base class to link against in when merging between categories # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_fakedylib.o a64_fakedylib.s # RUN: %lld -arch arm64 a64_fakedylib.o -o a64_fakedylib.dylib -dylib @@ -14,6 +15,15 @@ # RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_no_merge.dylib | FileCheck %s --check-prefixes=NO_MERGE_CATS # RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_merge.dylib | FileCheck %s --check-prefixes=MERGE_CATS +############ Test merging multiple categories into the base class ############ +# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_base_class_minimal.o merge_base_class_minimal.s +# RUN: %lld -arch arm64 -dylib -o merge_base_class_minimal_yes_merge.dylib 
-objc_category_merging merge_base_class_minimal.o merge_cat_minimal.o +# RUN: %lld -arch arm64 -dylib -o merge_base_class_minimal_no_merge.dylib merge_base_class_minimal.o merge_cat_minimal.o + +# RUN: llvm-objdump --objc-meta-data --macho merge_base_class_minimal_no_merge.dylib | FileCheck %s --check-prefixes=NO_MERGE_INTO_BASE +# RUN: llvm-objdump --objc-meta-data --macho merge_base_class_minimal_yes_merge.dylib | FileCheck %s --check-prefixes=YES_MERGE_INTO_BASE + + #### Check merge categories enabled ### # Check that the original categories are not there MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01 @@ -44,6 +54,28 @@ NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category01 NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category02 +#### Check merge cateogires into base class is disabled #### +NO_MERGE_INTO_BASE: __OBJC_$_CATEGORY_MyBaseClass_$_Category01 +NO_MERGE_INTO_BASE: __OBJC_$_CATEGORY_MyBaseClass_$_Category02 + +#### Check merge cateogires into base class is enabled and categories are merged into base class #### +YES_MERGE_INTO_BASE-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01 +YES_MERGE_INTO_BASE-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category02 + +YES_MERGE_INTO_BASE: _OBJC_CLASS_$_MyBaseClass +YES_MERGE_INTO_BASE-NEXT: _OBJC_METACLASS_$_MyBaseClass +YES_MERGE_INTO_BASE: baseMethods +YES_MERGE_INTO_BASE-NEXT: entsize 24 +YES_MERGE_INTO_BASE-NEXT: count 3 +YES_MERGE_INTO_BASE-NEXT: name {{.*}} cat01_InstanceMethod +YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8 +YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass(Category01) cat01_InstanceMethod] +YES_MERGE_INTO_BASE-NEXT: name {{.*}} cat02_InstanceMethod +YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8 +YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass(Category02) cat02_InstanceMethod] +YES_MERGE_INTO_BASE-NEXT: name {{.*}} baseInstanceMethod +YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8 +YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass baseInstanceMethod] #--- a64_fakedylib.s @@ -156,3 +188,94 @@ 
L_OBJC_IMAGE_INFO: .addrsig .addrsig_sym __OBJC_$_CATEGORY_MyBaseClass_$_Category01 + +#--- merge_base_class_minimal.s +; clang -c merge_base_class_minimal.mm -O3 -target arm64-apple-macos -arch arm64 -S -o merge_base_class_minimal.s +; ================== Generated from ObjC: ================== +; __attribute__((objc_root_class)) +; @interface MyBaseClass +; - (void)baseInstanceMethod; +; @end +; +; @implementation MyBaseClass +; - (void)baseInstanceMethod {} +; @end +; ================== Generated from ObjC ================== + .section __TEXT,__text,regular,pure_instructions + .build_version macos, 11, 0 + .p2align 2 +"-[MyBaseClass baseInstanceMethod]": + .cfi_startproc +; %bb.0: + ret + .cfi_endproc + .section __DATA,__objc_data + .globl _OBJC_CLASS_$_MyBaseClass + .p2align 3, 0x0 +_OBJC_CLASS_$_MyBaseClass: + .quad _OBJC_METACLASS_$_MyBaseClass + .quad 0 + .quad 0 + .quad 0 + .quad __OBJC_CLASS_RO_$_MyBaseClass + .globl _OBJC_METACLASS_$_MyBaseClass + .p2align 3, 0x0 +_OBJC_METACLASS_$_MyBaseClass: + .quad _OBJC_METACLASS_$_MyBaseClass + .quad _OBJC_CLASS_$_MyBaseClass + .quad 0 + .quad 0 + .quad __OBJC_METACLASS_RO_$_MyBaseClass + .section __TEXT,__objc_classname,cstring_literals +l_OBJC_CLASS_NAME_: + .asciz "MyBaseClass" + .section __DATA,__objc_const + .p2align 3, 0x0 +__OBJC_METACLASS_RO_$_MyBaseClass: + .long 3 + .long 40 + .long 40 + .space 4 + .quad 0 + .quad l_OBJC_CLASS_NAME_ + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .section __TEXT,__objc_methname,cstring_literals +l_OBJC_METH_VAR_NAME_: + .asciz "baseInstanceMethod" + .section __TEXT,__objc_methtype,cstring_literals +l_OBJC_METH_VAR_TYPE_: + .asciz "v16@0:8" + .section __DATA,__objc_const + .p2align 3, 0x0 +__OBJC_$_INSTANCE_METHODS_MyBaseClass: + .long 24 + .long 1 + .quad l_OBJC_METH_VAR_NAME_ + .quad l_OBJC_METH_VAR_TYPE_ + .quad "-[MyBaseClass baseInstanceMethod]" + .p2align 3, 0x0 +__OBJC_CLASS_RO_$_MyBaseClass: + .long 2 + .long 0 + .long 0 + .space 4 + .quad 0 + .quad 
l_OBJC_CLASS_NAME_ + .quad __OBJC_$_INSTANCE_METHODS_MyBaseClass + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .section __DATA,__objc_classlist,regular,no_dead_strip + .p2align 3, 0x0 +l_OBJC_LABEL_CLASS_$: + .quad _OBJC_CLASS_$_MyBaseClass + .section __DATA,__objc_imageinfo,regular,no_dead_strip +L_OBJC_IMAGE_INFO: + .long 0 + .long 64 +.subsections_via_symbols From d1d863c012cf3d5b407ae06d23a5628ec9510b7c Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 28 May 2024 10:32:09 -0700 Subject: [PATCH 37/89] [lldb] Remove lldbassert in AppleObjCTypeEncodingParser (#93332) AppleObjCTypeEncodingParser::BuildObjCObjectPointerType currently contains an lldbassert to detect situations where we have a forward declaration without a definition. According to the accompanying comment, its purpose is to catch "weird cases" during test suite runs. However, because this is an lldbassert, we show a scary message to our users who think this is a problem and report the issue to us. Unfortunately those reports aren't very actionable without a way to know the name of the type. This patch changes the lldbassert to a regular assert and emits a log message to the types log when this happens. 
rdar://127439898 --- .../AppleObjCTypeEncodingParser.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp index ca582cb1d5a46f..4871c59faefccc 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp @@ -13,6 +13,8 @@ #include "lldb/Symbol/CompilerType.h" #include "lldb/Target/Process.h" #include "lldb/Target/Target.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" #include "lldb/Utility/StringLexer.h" #include "clang/Basic/TargetInfo.h" @@ -234,12 +236,15 @@ clang::QualType AppleObjCTypeEncodingParser::BuildObjCObjectPointerType( auto types = decl_vendor->FindTypes(ConstString(name), /*max_matches*/ 1); - // The user can forward-declare something that has no definition. The runtime - // doesn't prohibit this at all. This is a rare and very weird case. We keep - // this assert in debug builds so we catch other weird cases. - lldbassert(!types.empty()); - if (types.empty()) + if (types.empty()) { + // The user can forward-declare something that has no definition. The + // runtime doesn't prohibit this at all. This is a rare and very weird + // case. Assert assert in debug builds so we catch other weird cases. 
+ assert(false && "forward declaration without definition"); + LLDB_LOG(GetLog(LLDBLog::Types), + "forward declaration without definition: {0}", name) return ast_ctx.getObjCIdType(); + } return ClangUtil::GetQualType(types.front().GetPointerType()); } else { From f69b6d2c99a10847a2d73c7fcd656d2ae22937ce Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 28 May 2024 10:36:20 -0700 Subject: [PATCH 38/89] [lldb] Add missing semicolon (NFC) --- .../ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp index 4871c59faefccc..ddaa7a8a597b4f 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp @@ -242,7 +242,7 @@ clang::QualType AppleObjCTypeEncodingParser::BuildObjCObjectPointerType( // case. Assert assert in debug builds so we catch other weird cases. assert(false && "forward declaration without definition"); LLDB_LOG(GetLog(LLDBLog::Types), - "forward declaration without definition: {0}", name) + "forward declaration without definition: {0}", name); return ast_ctx.getObjCIdType(); } From c09787b7d05083791b417c5b97a8cfd6d0874ed9 Mon Sep 17 00:00:00 2001 From: Franklin Zhang Date: Wed, 29 May 2024 01:39:35 +0800 Subject: [PATCH 39/89] [OMPT] Set default values for tsan function pointers (#93568) Avoid calling NULL function pointers in cases where ompt_start_tool succeeds but those tsan functions do not really exist. 
Fix https://github.com/llvm/llvm-project/issues/93524 --------- Co-authored-by: Joachim --- openmp/tools/archer/ompt-tsan.cpp | 33 +++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp index de77e25db2d399..d7658077e83ae0 100644 --- a/openmp/tools/archer/ompt-tsan.cpp +++ b/openmp/tools/archer/ompt-tsan.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,6 @@ #include #include #include -#include #include "omp-tools.h" @@ -146,18 +146,28 @@ void __attribute__((weak)) __tsan_flush_memory() {} static ArcherFlags *archer_flags; #ifndef TsanHappensBefore + +template static void __ompt_tsan_func(Args...) {} + +#define DECLARE_TSAN_FUNCTION(name, ...) \ + static void (*name)(__VA_ARGS__) = __ompt_tsan_func<__VA_ARGS__>; + // Thread Sanitizer is a tool that finds races in code. // See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations . // tsan detects these exact functions by name. 
extern "C" { -static void (*AnnotateHappensAfter)(const char *, int, const volatile void *); -static void (*AnnotateHappensBefore)(const char *, int, const volatile void *); -static void (*AnnotateIgnoreWritesBegin)(const char *, int); -static void (*AnnotateIgnoreWritesEnd)(const char *, int); -static void (*AnnotateNewMemory)(const char *, int, const volatile void *, - size_t); -static void (*__tsan_func_entry)(const void *); -static void (*__tsan_func_exit)(void); +DECLARE_TSAN_FUNCTION(AnnotateHappensAfter, const char *, int, + const volatile void *) +DECLARE_TSAN_FUNCTION(AnnotateHappensBefore, const char *, int, + const volatile void *) +DECLARE_TSAN_FUNCTION(AnnotateIgnoreWritesBegin, const char *, int) +DECLARE_TSAN_FUNCTION(AnnotateIgnoreWritesEnd, const char *, int) +DECLARE_TSAN_FUNCTION(AnnotateNewMemory, const char *, int, + const volatile void *, size_t) +DECLARE_TSAN_FUNCTION(__tsan_func_entry, const void *) +DECLARE_TSAN_FUNCTION(__tsan_func_exit) + +// RunningOnValgrind is used to detect absence of TSan and must intentionally be a nullptr. static int (*RunningOnValgrind)(void); } @@ -1142,7 +1152,10 @@ static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id, #define findTsanFunction(f, fSig) \ do { \ - if (NULL == (f = fSig dlsym(RTLD_DEFAULT, #f))) \ + void *fp = dlsym(RTLD_DEFAULT, #f); \ + if (fp) \ + f = fSig fp; \ + else \ printf("Unable to find TSan function " #f ".\n"); \ } while (0) From ef67f31e88dbae46811f03da945cfb8130c6fa15 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 10:51:21 -0700 Subject: [PATCH 40/89] [SCEV] Compute symbolic max backedge taken count in BTI directly. (NFC) Move symbolic max backedge taken count computation to BackedgeTakenInfo, use existing ExitNotTaken info. In preparation for https://github.com/llvm/llvm-project/pull/93498. 
--- llvm/include/llvm/Analysis/ScalarEvolution.h | 5 -- llvm/lib/Analysis/ScalarEvolution.cpp | 48 +++++++++----------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 5828cc156cc785..1d016b28347d27 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1761,11 +1761,6 @@ class ScalarEvolution { ExitLimit computeExitLimit(const Loop *L, BasicBlock *ExitingBlock, bool AllowPredicates = false); - /// Return a symbolic upper bound for the backedge taken count of the loop. - /// This is more general than getConstantMaxBackedgeTakenCount as it returns - /// an arbitrary expression as opposed to only constants. - const SCEV *computeSymbolicMaxBackedgeTakenCount(const Loop *L); - // Helper functions for computeExitLimitFromCond to avoid exponential time // complexity. diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 8d971e6a78e420..bb56b41fe15d58 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8647,8 +8647,28 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const { const SCEV * ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L, ScalarEvolution *SE) { - if (!SymbolicMax) - SymbolicMax = SE->computeSymbolicMaxBackedgeTakenCount(L); + if (!SymbolicMax) { + // Form an expression for the maximum exit count possible for this loop. We + // merge the max and exact information to approximate a version of + // getConstantMaxBackedgeTakenCount which isn't restricted to just + // constants. 
+ SmallVector ExitCounts; + + for (const auto &ENT : ExitNotTaken) { + const SCEV *ExitCount = ENT.SymbolicMaxNotTaken; + if (!isa(ExitCount)) { + assert(SE->DT.dominates(ENT.ExitingBlock, L->getLoopLatch()) && + "We should only have known counts for exiting blocks that " + "dominate latch!"); + ExitCounts.push_back(ExitCount); + } + } + if (ExitCounts.empty()) + SymbolicMax = SE->getCouldNotCompute(); + else + SymbolicMax = + SE->getUMinFromMismatchedTypes(ExitCounts, /*Sequential*/ true); + } return SymbolicMax; } @@ -14964,30 +14984,6 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS, return false; } -const SCEV * -ScalarEvolution::computeSymbolicMaxBackedgeTakenCount(const Loop *L) { - SmallVector ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - - // Form an expression for the maximum exit count possible for this loop. We - // merge the max and exact information to approximate a version of - // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. - SmallVector ExitCounts; - for (BasicBlock *ExitingBB : ExitingBlocks) { - const SCEV *ExitCount = - getExitCount(L, ExitingBB, ScalarEvolution::SymbolicMaximum); - if (!isa(ExitCount)) { - assert(DT.dominates(ExitingBB, L->getLoopLatch()) && - "We should only have known counts for exiting blocks that " - "dominate latch!"); - ExitCounts.push_back(ExitCount); - } - } - if (ExitCounts.empty()) - return getCouldNotCompute(); - return getUMinFromMismatchedTypes(ExitCounts, /*Sequential*/ true); -} - /// A rewriter to replace SCEV expressions in Map with the corresponding entry /// in the map. It skips AddRecExpr because we cannot guarantee that the /// replacement is loop invariant in the loop of the AddRec. 
From 0b2094c4553a63bb058c59073fc7c22d05e66977 Mon Sep 17 00:00:00 2001 From: Sirraide Date: Tue, 28 May 2024 19:56:04 +0200 Subject: [PATCH 41/89] [Clang] [NFC] Remove debug printing --- clang/lib/Sema/SemaStmtAttr.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 82373fe96a8243..6f538ed55cb72e 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -684,10 +684,8 @@ ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A, } if (!getLangOpts().CPlusPlus23 && - A.getSyntax() == AttributeCommonInfo::AS_CXX11) { - llvm::dbgs() << "Syntax: " << int(A.getSyntax()) << "\n"; + A.getSyntax() == AttributeCommonInfo::AS_CXX11) Diag(A.getLoc(), diag::ext_cxx23_attr) << A << Range; - } return Assumption; } From f0899964e4041b1dc70dc66450a7f6b3e3a22262 Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Tue, 28 May 2024 13:59:49 -0400 Subject: [PATCH 42/89] [clang][Sema] Don't emit 'declared here' note for builtin functions with no decl in source (#93394) Fixes #93369 --------- Co-authored-by: Timm Baeder Co-authored-by: S. B. Tam --- clang/docs/ReleaseNotes.rst | 3 +++ clang/lib/Sema/SemaLookup.cpp | 10 ++++++++++ clang/test/SemaCXX/invalid-if-constexpr.cpp | 3 +-- clang/test/SemaCXX/typo-correction-builtin-func.cpp | 8 ++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 clang/test/SemaCXX/typo-correction-builtin-func.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 173e61fbf7b2c1..894f6b04431744 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -541,6 +541,9 @@ Improvements to Clang's diagnostics - Clang emits a ``-Wparentheses`` warning for expressions with consecutive comparisons like ``x < y < z``. Fixes #GH20456. +- Clang no longer emits a "declared here" note for a builtin function that has no declaration in source. + Fixes #GH93369. 
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index ef0a655b631ab4..be6ea20a956a39 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -5897,6 +5897,16 @@ void Sema::diagnoseTypo(const TypoCorrection &Correction, NamedDecl *ChosenDecl = Correction.isKeyword() ? nullptr : Correction.getFoundDecl(); + + // For builtin functions which aren't declared anywhere in source, + // don't emit the "declared here" note. + if (const auto *FD = dyn_cast_if_present(ChosenDecl); + FD && FD->getBuiltinID() && + PrevNote.getDiagID() == diag::note_previous_decl && + Correction.getCorrectionRange().getBegin() == FD->getBeginLoc()) { + ChosenDecl = nullptr; + } + if (PrevNote.getDiagID() && ChosenDecl) Diag(ChosenDecl->getLocation(), PrevNote) << CorrectedQuotedStr << (ErrorRecovery ? FixItHint() : FixTypo); diff --git a/clang/test/SemaCXX/invalid-if-constexpr.cpp b/clang/test/SemaCXX/invalid-if-constexpr.cpp index 7643c47488f057..0007f2739cbbd0 100644 --- a/clang/test/SemaCXX/invalid-if-constexpr.cpp +++ b/clang/test/SemaCXX/invalid-if-constexpr.cpp @@ -4,8 +4,7 @@ namespace GH61885 { void similar() { // expected-note {{'similar' declared here}} if constexpr (similer<>) {} // expected-error {{use of undeclared identifier 'similer'; did you mean 'similar'?}} } -void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'; did you mean '__sync_swap'?}} \ - // expected-note {{'__sync_swap' declared here}} +void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'; did you mean '__sync_swap'?}} int AA() { return true;} // expected-note {{'AA' declared here}} diff --git a/clang/test/SemaCXX/typo-correction-builtin-func.cpp b/clang/test/SemaCXX/typo-correction-builtin-func.cpp new file mode 100644 index 00000000000000..8d369034d1be33 --- /dev/null +++ 
b/clang/test/SemaCXX/typo-correction-builtin-func.cpp @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +// Test that clang does not emit 'declared here' note for builtin functions that don't have a declaration in source. + +void t0() { + constexpr float A = __builtin_isinfinity(); // expected-error {{use of undeclared identifier '__builtin_isinfinity'; did you mean '__builtin_isfinite'?}} + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} From 73e22ff3d77db72bb9b6e22342417a5f4fe6afb4 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 28 May 2024 11:05:38 -0700 Subject: [PATCH 43/89] [Reassociate] Preserve NSW flags after expr tree rewriting (#93105) We can guarantee NSW on all operands in a reassociated add expression tree when: - All adds in an add operator tree are NSW, AND either - All add operands are guaranteed to be nonnegative, OR - All adds are also NUW - Alive2: - Nonnegative Operands - 3 operands: https://alive2.llvm.org/ce/z/G4XW6Q - 4 operands: https://alive2.llvm.org/ce/z/FWcZ6D - NUW NSW adds: https://alive2.llvm.org/ce/z/vRUxeC --------- Co-authored-by: Nikita Popov --- .../llvm/Transforms/Scalar/Reassociate.h | 12 ++- llvm/lib/Transforms/Scalar/Reassociate.cpp | 35 +++++--- llvm/test/Transforms/Reassociate/local-cse.ll | 40 +++++----- .../Transforms/Reassociate/reassoc-add-nsw.ll | 79 +++++++++++++++++++ 4 files changed, 132 insertions(+), 34 deletions(-) create mode 100644 llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll diff --git a/llvm/include/llvm/Transforms/Scalar/Reassociate.h b/llvm/include/llvm/Transforms/Scalar/Reassociate.h index f3a2e0f4380eb0..84d72df6fc4d81 100644 --- a/llvm/include/llvm/Transforms/Scalar/Reassociate.h +++ b/llvm/include/llvm/Transforms/Scalar/Reassociate.h @@ -63,6 +63,16 @@ struct Factor { Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} }; +struct OverflowTracking { + bool HasNUW; + bool HasNSW; + bool AllKnownNonNegative; + // Note: 
AllKnownNonNegative can be true in a case where one of the operands + // is negative, but one the operators is not NSW. AllKnownNonNegative should + // not be used independently of HasNSW + OverflowTracking() : HasNUW(true), HasNSW(true), AllKnownNonNegative(true) {} +}; + class XorOpnd; } // end namespace reassociate @@ -103,7 +113,7 @@ class ReassociatePass : public PassInfoMixin { void ReassociateExpression(BinaryOperator *I); void RewriteExprTree(BinaryOperator *I, SmallVectorImpl &Ops, - bool HasNUW); + reassociate::OverflowTracking Flags); Value *OptimizeExpression(BinaryOperator *I, SmallVectorImpl &Ops); Value *OptimizeAdd(Instruction *I, diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index d91320863e241d..c903e47a93cafd 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -471,7 +471,7 @@ using RepeatedValue = std::pair; static bool LinearizeExprTree(Instruction *I, SmallVectorImpl &Ops, ReassociatePass::OrderedSet &ToRedo, - bool &HasNUW) { + reassociate::OverflowTracking &Flags) { assert((isa(I) || isa(I)) && "Expected a UnaryOperator or BinaryOperator!"); LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); @@ -512,6 +512,7 @@ static bool LinearizeExprTree(Instruction *I, using LeafMap = DenseMap; LeafMap Leaves; // Leaf -> Total weight so far. SmallVector LeafOrder; // Ensure deterministic leaf output order. + const DataLayout DL = I->getModule()->getDataLayout(); #ifndef NDEBUG SmallPtrSet Visited; // For checking the iteration scheme. @@ -520,8 +521,10 @@ static bool LinearizeExprTree(Instruction *I, std::pair P = Worklist.pop_back_val(); I = P.first; // We examine the operands of this binary operator. - if (isa(I)) - HasNUW &= I->hasNoUnsignedWrap(); + if (isa(I)) { + Flags.HasNUW &= I->hasNoUnsignedWrap(); + Flags.HasNSW &= I->hasNoSignedWrap(); + } for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands. 
Value *Op = I->getOperand(OpIdx); @@ -648,6 +651,8 @@ static bool LinearizeExprTree(Instruction *I, // Ensure the leaf is only output once. It->second = 0; Ops.push_back(std::make_pair(V, Weight)); + if (Opcode == Instruction::Add && Flags.AllKnownNonNegative && Flags.HasNSW) + Flags.AllKnownNonNegative &= isKnownNonNegative(V, SimplifyQuery(DL)); } // For nilpotent operations or addition there may be no operands, for example @@ -666,7 +671,7 @@ static bool LinearizeExprTree(Instruction *I, /// linearized and optimized, emit them in-order. void ReassociatePass::RewriteExprTree(BinaryOperator *I, SmallVectorImpl &Ops, - bool HasNUW) { + OverflowTracking Flags) { assert(Ops.size() > 1 && "Single values should be used directly!"); // Since our optimizations should never increase the number of operations, the @@ -834,8 +839,12 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, // Note that it doesn't hold for mul if one of the operands is zero. // TODO: We can preserve NUW flag if we prove that all mul operands // are non-zero. 
- if (HasNUW && ExpressionChangedStart->getOpcode() == Instruction::Add) - ExpressionChangedStart->setHasNoUnsignedWrap(); + if (ExpressionChangedStart->getOpcode() == Instruction::Add) { + if (Flags.HasNUW) + ExpressionChangedStart->setHasNoUnsignedWrap(); + if (Flags.HasNSW && (Flags.AllKnownNonNegative || Flags.HasNUW)) + ExpressionChangedStart->setHasNoSignedWrap(); + } } } @@ -1192,8 +1201,8 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { return nullptr; SmallVector Tree; - bool HasNUW = true; - MadeChange |= LinearizeExprTree(BO, Tree, RedoInsts, HasNUW); + OverflowTracking Flags; + MadeChange |= LinearizeExprTree(BO, Tree, RedoInsts, Flags); SmallVector Factors; Factors.reserve(Tree.size()); for (unsigned i = 0, e = Tree.size(); i != e; ++i) { @@ -1235,7 +1244,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { if (!FoundFactor) { // Make sure to restore the operands to the expression tree. - RewriteExprTree(BO, Factors, HasNUW); + RewriteExprTree(BO, Factors, Flags); return nullptr; } @@ -1247,7 +1256,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { RedoInsts.insert(BO); V = Factors[0].Op; } else { - RewriteExprTree(BO, Factors, HasNUW); + RewriteExprTree(BO, Factors, Flags); V = BO; } @@ -2373,8 +2382,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { // First, walk the expression tree, linearizing the tree, collecting the // operand information. SmallVector Tree; - bool HasNUW = true; - MadeChange |= LinearizeExprTree(I, Tree, RedoInsts, HasNUW); + OverflowTracking Flags; + MadeChange |= LinearizeExprTree(I, Tree, RedoInsts, Flags); SmallVector Ops; Ops.reserve(Tree.size()); for (const RepeatedValue &E : Tree) @@ -2567,7 +2576,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { dbgs() << '\n'); // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. 
- RewriteExprTree(I, Ops, HasNUW); + RewriteExprTree(I, Ops, Flags); } void diff --git a/llvm/test/Transforms/Reassociate/local-cse.ll b/llvm/test/Transforms/Reassociate/local-cse.ll index 4d0467e263f553..d0d609f022b46b 100644 --- a/llvm/test/Transforms/Reassociate/local-cse.ll +++ b/llvm/test/Transforms/Reassociate/local-cse.ll @@ -26,16 +26,16 @@ define void @chain_spanning_several_blocks(i64 %inv1, i64 %inv2, i64 %inv3, i64 ; LOCAL_CSE-LABEL: define void @chain_spanning_several_blocks ; LOCAL_CSE-SAME: (i64 [[INV1:%.*]], i64 [[INV2:%.*]], i64 [[INV3:%.*]], i64 [[INV4:%.*]], i64 [[INV5:%.*]]) { ; LOCAL_CSE-NEXT: bb1: -; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw i64 [[INV2]], [[INV1]] +; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw nsw i64 [[INV2]], [[INV1]] ; LOCAL_CSE-NEXT: br label [[BB2:%.*]] ; LOCAL_CSE: bb2: ; LOCAL_CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() -; LOCAL_CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV4]] -; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw i64 [[CHAIN_A1]], [[VAL_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV5]] -; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw i64 [[CHAIN_B1]], [[VAL_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add nuw i64 [[INV3]], [[INV1]] -; LOCAL_CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_C0]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV4]] +; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV5]] +; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_B1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add nuw nsw i64 [[INV3]], [[INV1]] +; LOCAL_CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_C0]], [[VAL_BB2]] ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) @@ -47,11 +47,11 @@ define void 
@chain_spanning_several_blocks(i64 %inv1, i64 %inv2, i64 %inv3, i64 ; CSE-NEXT: br label [[BB2:%.*]] ; CSE: bb2: ; CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() -; CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw i64 [[VAL_BB2]], [[INV1]] -; CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV2]] +; CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw nsw i64 [[VAL_BB2]], [[INV1]] +; CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV2]] ; CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV4]] ; CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV5]] -; CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV3]] +; CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV3]] ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) @@ -90,19 +90,19 @@ define void @chain_spanning_several_blocks_no_entry_anchor() { ; LOCAL_CSE-NEXT: br label [[BB1:%.*]] ; LOCAL_CSE: bb1: ; LOCAL_CSE-NEXT: [[INV1_BB1:%.*]] = call i64 @get_val() -; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw i64 [[INV1_BB1]], [[INV2_BB0]] +; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw nsw i64 [[INV1_BB1]], [[INV2_BB0]] ; LOCAL_CSE-NEXT: br label [[BB2:%.*]] ; LOCAL_CSE: bb2: ; LOCAL_CSE-NEXT: [[INV3_BB2:%.*]] = call i64 @get_val() ; LOCAL_CSE-NEXT: [[INV4_BB2:%.*]] = call i64 @get_val() ; LOCAL_CSE-NEXT: [[INV5_BB2:%.*]] = call i64 @get_val() ; LOCAL_CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() -; LOCAL_CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV4_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw i64 [[CHAIN_A1]], [[VAL_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV5_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw i64 [[CHAIN_B1]], [[VAL_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add nuw i64 [[VAL_BB2]], [[INV1_BB1]] -; LOCAL_CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_C0]], [[INV3_BB2]] +; LOCAL_CSE-NEXT: 
[[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV4_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV5_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_B1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add nuw nsw i64 [[VAL_BB2]], [[INV1_BB1]] +; LOCAL_CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_C0]], [[INV3_BB2]] ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) @@ -120,11 +120,11 @@ define void @chain_spanning_several_blocks_no_entry_anchor() { ; CSE-NEXT: [[INV4_BB2:%.*]] = call i64 @get_val() ; CSE-NEXT: [[INV5_BB2:%.*]] = call i64 @get_val() ; CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() -; CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw i64 [[VAL_BB2]], [[INV1_BB1]] -; CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV2_BB0]] +; CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw nsw i64 [[VAL_BB2]], [[INV1_BB1]] +; CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV2_BB0]] ; CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV4_BB2]] ; CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV5_BB2]] -; CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV3_BB2]] +; CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV3_BB2]] ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) diff --git a/llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll b/llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll new file mode 100644 index 00000000000000..fcebc4980e6d7d --- /dev/null +++ b/llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s 
-passes=reassociate -S | FileCheck %s +define i32 @nsw_preserve_nonnegative(ptr %ptr0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: define i32 @nsw_preserve_nonnegative( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[PTR0]], align 4, !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[PTR1]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[PTR2]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[ADD0:%.*]] = add nsw i32 [[V1]], [[V0]] +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD0]], [[V2]] +; CHECK-NEXT: ret i32 [[ADD1]] +; + %v0 = load i32, ptr %ptr0, !range !1 + %v1 = load i32, ptr %ptr1, !range !1 + %v2 = load i32, ptr %ptr2, !range !1 + %add0 = add nsw i32 %v1, %v2 + %add1 = add nsw i32 %add0, %v0 + ret i32 %add1 +} + +define i32 @nsw_preserve_nuw_nsw(ptr %ptr0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: define i32 @nsw_preserve_nuw_nsw( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[PTR1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[PTR2]], align 4 +; CHECK-NEXT: [[ADD0:%.*]] = add nuw nsw i32 [[V1]], [[V0]] +; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i32 [[ADD0]], [[V2]] +; CHECK-NEXT: ret i32 [[ADD1]] +; + %v0 = load i32, ptr %ptr0 + %v1 = load i32, ptr %ptr1 + %v2 = load i32, ptr %ptr2 + %add0 = add nuw nsw i32 %v1, %v2 + %add1 = add nuw nsw i32 %add0, %v0 + ret i32 %add1 +} + +define i32 @nsw_dont_preserve_negative(ptr %ptr0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: define i32 @nsw_dont_preserve_negative( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[PTR1]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[PTR2]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[ADD0:%.*]] = add i32 [[V1]], 
[[V0]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ADD0]], [[V2]] +; CHECK-NEXT: ret i32 [[ADD1]] +; + %v0 = load i32, ptr %ptr0 + %v1 = load i32, ptr %ptr1, !range !1 + %v2 = load i32, ptr %ptr2, !range !1 + %add0 = add nsw i32 %v1, %v2 + %add1 = add nsw i32 %add0, %v0 + ret i32 %add1 +} + +define i32 @nsw_nopreserve_notallnsw(ptr %ptr0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: define i32 @nsw_nopreserve_notallnsw( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[PTR0]], align 4, !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[PTR1]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[PTR2]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[ADD0:%.*]] = add i32 [[V1]], [[V0]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ADD0]], [[V2]] +; CHECK-NEXT: ret i32 [[ADD1]] +; + %v0 = load i32, ptr %ptr0, !range !1 + %v1 = load i32, ptr %ptr1, !range !1 + %v2 = load i32, ptr %ptr2, !range !1 + %add0 = add nsw i32 %v1, %v2 + %add1 = add i32 %add0, %v0 + ret i32 %add1 +} + +; Positive 32 bit integers +!1 = !{i32 0, i32 2147483648} +;. +; CHECK: [[RNG0]] = !{i32 0, i32 -2147483648} +;. From 99835922ca2a2ac20769271a49a5f8055bb5dc93 Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Tue, 28 May 2024 11:23:15 -0700 Subject: [PATCH 44/89] [mlir][sparse] remove sparse encoding propagation pass. 
(#93593) --- .../Dialect/SparseTensor/Transforms/Passes.h | 6 ---- .../Dialect/SparseTensor/Transforms/Passes.td | 36 ------------------- .../Transforms/SparseTensorPasses.cpp | 13 ------- 3 files changed, 55 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index bb49d6c256f21b..d6d038ef65bdf4 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -65,12 +65,6 @@ void populateSparseAssembler(RewritePatternSet &patterns, bool directOut); std::unique_ptr createSparseAssembler(); std::unique_ptr createSparseAssembler(bool directOut); -//===----------------------------------------------------------------------===// -// The SparseEncodingPropagation pass. -//===----------------------------------------------------------------------===// - -std::unique_ptr createSparseEncodingPropagationPass(); - //===----------------------------------------------------------------------===// // The SparseReinterpretMap pass. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td index 94c3ca60030eeb..2f844cee5ff528 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td @@ -40,42 +40,6 @@ def SparseAssembler : Pass<"sparse-assembler", "ModuleOp"> { ]; } -def SparseEncodingPropagation : Pass<"sparse-encoding-propagation", "func::FuncOp"> { - let summary = "Propagate sparse tensor encodings"; - let description = [{ - A pass that propagates sparse tensor encodings. - - Background: To avoid introducing repetitive operations, sparse tensors - in MLIR try to reuse tensor operations whenever available. 
However, most - tensor operations are canonicalized/transformed without the knowledge - of sparsity. The pass tries to propagate missing sparse encodings. - - For example: - ```mlir - %s = tensor.extract_slice %input[0, 0,] [2, 1] [1, 1] - : tensor<2x3xf32, #sparse> to tensor<2x1xf32, #sparse> - - // After rank reducing (by tensor dialect transformation) - %t = tensor.extract_slice %input[0, 0,] [2, 1] [1, 1] - : tensor<2x3xf32, #sparse> to tensor<2xf32> - %s = tensor.expand_shape [[0, 1]] %t - : tensor<2xf32> to tensor<2x1xf32, #sparse> - - // After sparsity propagation - %t = tensor.extract_slice %input[0, 0,] [2, 1] [1, 1] - : tensor<2x3xf32, #sparse> to tensor<2xf32, #sparse1> - %s = tensor.expand_shape [[0, 1]] %t - : tensor<2xf32, #sparse1> to tensor<2x1xf32, #sparse> - ``` - }]; - - let constructor = "mlir::createSparseEncodingPropagationPass()"; - let dependentDialects = [ - "sparse_tensor::SparseTensorDialect", - "tensor::TensorDialect", - ]; -} - def SparseReinterpretMap : Pass<"sparse-reinterpret-map", "ModuleOp"> { let summary = "Reinterprets sparse tensor type mappings"; let description = [{ diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp index f57353b5892b5a..b42d58634a36c4 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp @@ -23,7 +23,6 @@ namespace mlir { #define GEN_PASS_DEF_SPARSEASSEMBLER -#define GEN_PASS_DEF_SPARSEENCODINGPROPAGATION #define GEN_PASS_DEF_SPARSEREINTERPRETMAP #define GEN_PASS_DEF_PRESPARSIFICATIONREWRITE #define GEN_PASS_DEF_SPARSIFICATIONPASS @@ -61,14 +60,6 @@ struct SparseAssembler : public impl::SparseAssemblerBase { } }; -struct SparseEncodingPropagation - : public impl::SparseEncodingPropagationBase { - SparseEncodingPropagation() = default; - SparseEncodingPropagation(const SparseEncodingPropagation &pass) = default; - - void 
runOnOperation() override {} -}; - struct SparseReinterpretMap : public impl::SparseReinterpretMapBase { SparseReinterpretMap() = default; @@ -407,10 +398,6 @@ std::unique_ptr mlir::createSparseAssembler() { return std::make_unique(); } -std::unique_ptr mlir::createSparseEncodingPropagationPass() { - return std::make_unique(); -} - std::unique_ptr mlir::createSparseReinterpretMapPass() { return std::make_unique(); } From 196a0809822ba4ac0fc669a46cbacee8afbe36c2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 28 May 2024 20:27:07 +0200 Subject: [PATCH 45/89] DAG: Handle fminnum_ieee/fmaxnum_ieee in basic legalization Handle these in promote float and vector widening. Currently we happen to avoid emitting these unless legal or custom. Avoids regression in a future commit which wants to unconditionally emit these. --- llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 2 ++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index fc96ecdc662808..fb1424f75e097d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2488,6 +2488,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMINIMUM: case ISD::FMAXNUM: case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: case ISD::FMUL: case ISD::FPOW: case ISD::FREM: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 40e621f0db2209..14e8708fd3f38f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4237,8 +4237,12 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SHL: case ISD::VP_SHL: case ISD::SRA: case ISD::VP_SRA: case 
ISD::SRL: case ISD::VP_SRL: - case ISD::FMINNUM: case ISD::VP_FMINNUM: - case ISD::FMAXNUM: case ISD::VP_FMAXNUM: + case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: + case ISD::VP_FMINNUM: + case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: + case ISD::VP_FMAXNUM: case ISD::FMINIMUM: case ISD::VP_FMINIMUM: case ISD::FMAXIMUM: From 08de0b3cf54e4998799673f835e9a7d5ead8efab Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 11:36:48 -0700 Subject: [PATCH 46/89] [WebAssembly] Add tests for EH/SjLj option errors (#93583) This adds tests for EH/SjLj option errors and swaps the error checking order for unimportant cosmetic reasons (I think checking EH/SjLj conflicts is more important than the model checking) --- .../WebAssembly/WebAssemblyTargetMachine.cpp | 34 ++++++++++--------- .../CodeGen/WebAssembly/eh-option-errors.ll | 19 +++++++++++ .../WebAssembly/lower-em-ehsjlj-options.ll | 3 -- 3 files changed, 37 insertions(+), 19 deletions(-) create mode 100644 llvm/test/CodeGen/WebAssembly/eh-option-errors.ll diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index de342e89657367..68126992ddcd72 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -388,15 +388,29 @@ using WebAssembly::WasmEnableEmSjLj; using WebAssembly::WasmEnableSjLj; static void basicCheckForEHAndSjLj(TargetMachine *TM) { - // Before checking, we make sure TargetOptions.ExceptionModel is the same as + + // You can't enable two modes of EH at the same time + if (WasmEnableEmEH && WasmEnableEH) + report_fatal_error( + "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh"); + // You can't enable two modes of SjLj at the same time + if (WasmEnableEmSjLj && WasmEnableSjLj) + report_fatal_error( + "-enable-emscripten-sjlj not allowed with -wasm-enable-sjlj"); + // You can't mix Emscripten EH with Wasm SjLj. 
+ if (WasmEnableEmEH && WasmEnableSjLj) + report_fatal_error( + "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj"); + + // Here we make sure TargetOptions.ExceptionModel is the same as // MCAsmInfo.ExceptionsType. Normally these have to be the same, because clang // stores the exception model info in LangOptions, which is later transferred // to TargetOptions and MCAsmInfo. But when clang compiles bitcode directly, // clang's LangOptions is not used and thus the exception model info is not // correctly transferred to TargetOptions and MCAsmInfo, so we make sure we - // have the correct exception model in WebAssemblyMCAsmInfo constructor. - // But in this case TargetOptions is still not updated, so we make sure they - // are the same. + // have the correct exception model in WebAssemblyMCAsmInfo constructor. But + // in this case TargetOptions is still not updated, so we make sure they are + // the same. TM->Options.ExceptionModel = TM->getMCAsmInfo()->getExceptionHandlingType(); // Basic Correctness checking related to -exception-model @@ -418,18 +432,6 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) { "-exception-model=wasm only allowed with at least one of " "-wasm-enable-eh or -wasm-enable-sjlj"); - // You can't enable two modes of EH at the same time - if (WasmEnableEmEH && WasmEnableEH) - report_fatal_error( - "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh"); - // You can't enable two modes of SjLj at the same time - if (WasmEnableEmSjLj && WasmEnableSjLj) - report_fatal_error( - "-enable-emscripten-sjlj not allowed with -wasm-enable-sjlj"); - // You can't mix Emscripten EH with Wasm SjLj. - if (WasmEnableEmEH && WasmEnableSjLj) - report_fatal_error( - "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj"); // Currently it is allowed to mix Wasm EH with Emscripten SjLj as an interim // measure, but some code will error out at compile time in this combination. 
// See WebAssemblyLowerEmscriptenEHSjLj pass for details. diff --git a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll new file mode 100644 index 00000000000000..74d02ddc405d3f --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll @@ -0,0 +1,19 @@ +target triple = "wasm32-unknown-unknown" + +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-eh 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_EH +; EM_EH_W_WASM_EH: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh + +; RUN: not --crash llc < %s -enable-emscripten-sjlj -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_SJLJ_W_WASM_SJLJ +; EM_SJLJ_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-sjlj not allowed with -wasm-enable-sjlj + +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_SJLJ +; EM_EH_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj + +; RUN: not --crash llc < %s -wasm-enable-eh -exception-model=dwarf 2>&1 | FileCheck %s --check-prefix=EH_MODEL_DWARF +; EH_MODEL_DWARF: LLVM ERROR: -exception-model should be either 'none' or 'wasm' + +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -exception-model=wasm 2>&1 | FileCheck %s --check-prefix=EM_EH_W_MODEL_WASM +; EM_EH_W_MODEL_WASM: LLVM ERROR: -exception-model=wasm not allowed with -enable-emscripten-cxx-exceptions + +; RUN: not --crash llc < %s -exception-model=wasm 2>&1 | FileCheck %s --check-prefix=MODEL_WASM_WO_WASM_EH_SJLJ +; MODEL_WASM_WO_WASM_EH_SJLJ: LLVM ERROR: -exception-model=wasm only allowed with at least one of -wasm-enable-eh or -wasm-enable-sjlj diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll index 4a63c812d6ae9a..66872a54229862 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll +++ 
b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -enable-emscripten-cxx-exceptions | FileCheck %s --check-prefix=EH ; RUN: llc < %s -enable-emscripten-sjlj | FileCheck %s --check-prefix=SJLJ ; RUN: llc < %s | FileCheck %s --check-prefix=NONE -; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -exception-model=wasm 2>&1 | FileCheck %s --check-prefix=WASM-EH-EM-EH target triple = "wasm32-unknown-unknown" @@ -97,5 +96,3 @@ declare void @free(ptr) attributes #0 = { returns_twice } attributes #1 = { noreturn } attributes #2 = { nounwind } - -; WASM-EH-EM-EH: LLVM ERROR: -exception-model=wasm not allowed with -enable-emscripten-cxx-exceptions From d33864d5d8ae55ff1c86510dc475fd9dd72d61c7 Mon Sep 17 00:00:00 2001 From: Karthika Devi C Date: Wed, 29 May 2024 00:11:58 +0530 Subject: [PATCH 47/89] [polly] Fix cppcheck SA comment reported in #91235 (#93505) This patch moves the unreachable assert before return statement. Fixes #91235. --- polly/include/polly/ScheduleTreeTransform.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polly/include/polly/ScheduleTreeTransform.h b/polly/include/polly/ScheduleTreeTransform.h index ee504c4e5f5244..6bd5a3abf9ea28 100644 --- a/polly/include/polly/ScheduleTreeTransform.h +++ b/polly/include/polly/ScheduleTreeTransform.h @@ -47,9 +47,9 @@ struct ScheduleTreeVisitor { return getDerived().visitSequence(Node.as(), std::forward(args)...); case isl_schedule_node_set: + assert(isl_schedule_node_n_children(Node.get()) >= 2); return getDerived().visitSet(Node.as(), std::forward(args)...); - assert(isl_schedule_node_n_children(Node.get()) >= 2); case isl_schedule_node_leaf: assert(isl_schedule_node_n_children(Node.get()) == 0); return getDerived().visitLeaf(Node.as(), From 5901d4005f015a46185ddc080038c1a3db3fa2c7 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 28 May 2024 14:55:18 -0400 Subject: [PATCH 48/89] [C] Disallow declarations where a statement is required 
(#92908) This fixes a regression introduced in 8bd06d5b65845e5e01dd899a2deb773580460b89 where Clang began to accept a declaration where a statement is required. e.g., ``` if (1) int x; // Previously accepted, now properly rejected ``` Fixes #92775 --- clang/docs/ReleaseNotes.rst | 3 ++ clang/include/clang/Parse/Parser.h | 9 +++-- clang/lib/Parse/ParseStmt.cpp | 10 ++++- clang/test/C/C99/block-scopes.c | 3 +- clang/test/Parser/decls.c | 39 +++++++++++++++++++ .../test/SemaOpenACC/parallel-loc-and-stmt.c | 6 ++- 6 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 clang/test/Parser/decls.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 894f6b04431744..9091f6341bd9b8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -632,6 +632,9 @@ Bug Fixes in This Version - ``__is_array`` and ``__is_bounded_array`` no longer return ``true`` for zero-sized arrays. Fixes (#GH54705). +- Correctly reject declarations where a statement is required in C. + Fixes #GH92775 + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 8493026f5f7a69..00b475e5b42824 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -467,15 +467,18 @@ class Parser : public CodeCompletionHandler { /// Flags describing a context in which we're parsing a statement. enum class ParsedStmtContext { + /// This context permits declarations in language modes where declarations + /// are not statements. + AllowDeclarationsInC = 0x1, /// This context permits standalone OpenMP directives. - AllowStandaloneOpenMPDirectives = 0x1, + AllowStandaloneOpenMPDirectives = 0x2, /// This context is at the top level of a GNU statement expression. - InStmtExpr = 0x2, + InStmtExpr = 0x4, /// The context of a regular substatement. SubStmt = 0, /// The context of a compound-statement. 
- Compound = AllowStandaloneOpenMPDirectives, + Compound = AllowDeclarationsInC | AllowStandaloneOpenMPDirectives, LLVM_MARK_AS_BITMASK_ENUM(InStmtExpr) }; diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index b0af04451166ca..c25203243ee49b 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -239,7 +239,15 @@ StmtResult Parser::ParseStatementOrDeclarationAfterAttributes( auto IsStmtAttr = [](ParsedAttr &Attr) { return Attr.isStmtAttr(); }; bool AllAttrsAreStmtAttrs = llvm::all_of(CXX11Attrs, IsStmtAttr) && llvm::all_of(GNUAttrs, IsStmtAttr); - if (((GNUAttributeLoc.isValid() && !(HaveAttrs && AllAttrsAreStmtAttrs)) || + // In C, the grammar production for statement (C23 6.8.1p1) does not allow + // for declarations, which is different from C++ (C++23 [stmt.pre]p1). So + // in C++, we always allow a declaration, but in C we need to check whether + // we're in a statement context that allows declarations. e.g., in C, the + // following is invalid: if (1) int x; + if ((getLangOpts().CPlusPlus || getLangOpts().MicrosoftExt || + (StmtCtx & ParsedStmtContext::AllowDeclarationsInC) != + ParsedStmtContext()) && + ((GNUAttributeLoc.isValid() && !(HaveAttrs && AllAttrsAreStmtAttrs)) || isDeclarationStatement())) { SourceLocation DeclStart = Tok.getLocation(), DeclEnd; DeclGroupPtrTy Decl; diff --git a/clang/test/C/C99/block-scopes.c b/clang/test/C/C99/block-scopes.c index 589047df3e52bc..116e5d922593e0 100644 --- a/clang/test/C/C99/block-scopes.c +++ b/clang/test/C/C99/block-scopes.c @@ -18,8 +18,9 @@ enum {a, b}; void different(void) { - if (sizeof(enum {b, a}) != sizeof(int)) + if (sizeof(enum {b, a}) != sizeof(int)) { _Static_assert(a == 1, ""); + } /* In C89, the 'b' found here would have been from the enum declaration in * the controlling expression of the selection statement, not from the global * declaration. 
In C99 and later, that enumeration is scoped to the 'if' diff --git a/clang/test/Parser/decls.c b/clang/test/Parser/decls.c new file mode 100644 index 00000000000000..39ef05bf4bd999 --- /dev/null +++ b/clang/test/Parser/decls.c @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 %s -fsyntax-only -verify -pedantic + +// Test that we can parse declarations at global scope. +int v; + +void func(void) { + // Test that we can parse declarations within a compound statement. + int a; + { + int b; + } + + int z = ({ // expected-warning {{use of GNU statement expression extension}} + // Test that we can parse declarations within a GNU statement expression. + int w = 12; + w; + }); + + // Test that we diagnose declarations where a statement is required. + // See GH92775. + if (1) + int x; // expected-error {{expected expression}} + for (;;) + int c; // expected-error {{expected expression}} + + label: + int y; // expected-warning {{label followed by a declaration is a C23 extension}} + + // Test that lookup works as expected. + (void)a; + (void)v; + (void)z; + (void)b; // expected-error {{use of undeclared identifier 'b'}} + (void)w; // expected-error {{use of undeclared identifier 'w'}} + (void)x; // expected-error {{use of undeclared identifier 'x'}} + (void)c; // expected-error {{use of undeclared identifier 'c'}} + (void)y; +} + diff --git a/clang/test/SemaOpenACC/parallel-loc-and-stmt.c b/clang/test/SemaOpenACC/parallel-loc-and-stmt.c index ba29f6da8ba25d..bbcdd823483a52 100644 --- a/clang/test/SemaOpenACC/parallel-loc-and-stmt.c +++ b/clang/test/SemaOpenACC/parallel-loc-and-stmt.c @@ -33,9 +33,11 @@ int foo3; void func() { // FIXME: Should we disallow this on declarations, or consider this to be on - // the initialization? + // the initialization? This is currently rejected in C because + // Parser::ParseOpenACCDirectiveStmt() calls ParseStatement() and passes the + // statement context as "SubStmt" which does not allow for a declaration in C. 
#pragma acc parallel - int foo; + int foo; // expected-error {{expected expression}} #pragma acc parallel { From debdbeda15802900615d1bee83e4fc519abeaba6 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Tue, 28 May 2024 20:04:27 +0100 Subject: [PATCH 49/89] [mlir] Remove dialect specific bufferization passes (Reland) (#93535) These passes have been depreciated for a long time and replaced by one-shot bufferization. These passes are also unsafe because they do not check for read-after-write conflicts. Relands https://github.com/llvm/llvm-project/pull/93488 which failed on buildbot. Fixes the failure by updating integration tests to use one-shot-bufferize instead. --- .../mlir/Dialect/Arith/Transforms/Passes.h | 3 - .../mlir/Dialect/Arith/Transforms/Passes.td | 16 ----- .../Dialect/Bufferization/Transforms/Passes.h | 3 - .../Bufferization/Transforms/Passes.td | 5 -- mlir/include/mlir/Dialect/Linalg/Passes.h | 4 -- mlir/include/mlir/Dialect/Linalg/Passes.td | 10 --- .../mlir/Dialect/Shape/Transforms/Passes.h | 7 -- .../mlir/Dialect/Shape/Transforms/Passes.td | 7 -- .../mlir/Dialect/Tensor/Transforms/Passes.h | 3 - .../mlir/Dialect/Tensor/Transforms/Passes.td | 5 -- .../mlir/Dialect/Vector/Transforms/Passes.h | 3 - .../mlir/Dialect/Vector/Transforms/Passes.td | 5 -- .../Dialect/Arith/Transforms/Bufferize.cpp | 67 ------------------- .../Dialect/Arith/Transforms/CMakeLists.txt | 1 - .../Bufferization/Transforms/Bufferize.cpp | 23 ------- .../Dialect/Linalg/Transforms/Bufferize.cpp | 52 -------------- .../Dialect/Linalg/Transforms/CMakeLists.txt | 1 - .../Dialect/Shape/Transforms/Bufferize.cpp | 49 -------------- .../Dialect/Shape/Transforms/CMakeLists.txt | 1 - .../Dialect/Tensor/Transforms/Bufferize.cpp | 58 ---------------- .../Dialect/Tensor/Transforms/CMakeLists.txt | 1 - .../Dialect/Vector/Transforms/Bufferize.cpp | 55 --------------- .../Dialect/Vector/Transforms/CMakeLists.txt | 1 - mlir/test/Dialect/Arith/bufferize.mlir | 8 +-- 
mlir/test/Dialect/Linalg/bufferize.mlir | 30 +-------- mlir/test/Dialect/Shape/bufferize.mlir | 2 +- .../Dialect/SparseTensor/sparse_lower.mlir | 3 +- .../SparseTensor/sparse_lower_col.mlir | 3 +- .../SparseTensor/sparse_lower_inplace.mlir | 3 +- mlir/test/Dialect/Tensor/bufferize.mlir | 2 +- .../Dialect/Vector/bufferize-invalid.mlir | 3 +- mlir/test/Dialect/Vector/bufferize.mlir | 2 +- .../Dialect/Complex/CPU/correctness.mlir | 2 +- .../Linalg/CPU/test-collapse-tensor.mlir | 6 +- .../Dialect/Linalg/CPU/test-elementwise.mlir | 2 +- .../Linalg/CPU/test-expand-tensor.mlir | 6 +- .../Dialect/Linalg/CPU/test-padtensor.mlir | 3 +- .../test-subtensor-insert-multiple-uses.mlir | 4 +- .../Linalg/CPU/test-subtensor-insert.mlir | 4 +- .../Dialect/Linalg/CPU/test-tensor-e2e.mlir | 5 +- .../Linalg/CPU/test-tensor-matmul.mlir | 10 +-- .../Dialect/Memref/print-memref.mlir | 2 +- .../Dialect/Memref/verify-memref.mlir | 2 +- .../Vector/CPU/AMX/test-mulf-full.mlir | 5 +- .../Vector/CPU/AMX/test-muli-full.mlir | 6 +- 45 files changed, 40 insertions(+), 453 deletions(-) delete mode 100644 mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp delete mode 100644 mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp delete mode 100644 mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp delete mode 100644 mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp delete mode 100644 mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h index cbc6147cb81e22..9dc262cc72ed00 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h @@ -24,9 +24,6 @@ namespace arith { class WideIntEmulationConverter; class NarrowTypeEmulationConverter; -/// Create a pass to bufferize arith.constant ops. 
-std::unique_ptr createConstantBufferizePass(uint64_t alignment = 0); - /// Adds patterns to emulate wide Arith and Function ops over integer /// types into supported ones. This is done by splitting original power-of-two /// i2N integer types into two iN halves. diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td index 4096e309199e98..550c5c0cf4f60f 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td @@ -11,22 +11,6 @@ include "mlir/Pass/PassBase.td" -def ArithBufferizePass : Pass<"arith-bufferize", "ModuleOp"> { - let summary = "Bufferize Arith dialect ops."; - let description = [{ - This pass bufferizes arith dialect ops. - - This pass needs to be a module pass because it inserts memref.global - ops into the module, which cannot be done safely from a function pass due to - multi-threading. Most other bufferization passes can run in parallel at - function granularity. - }]; - let options = [ - Option<"alignment", "alignment", "unsigned", /*default=*/"0", - "Create global memrefs with a specified alignment">, - ]; -} - def ArithExpandOpsPass : Pass<"arith-expand"> { let summary = "Legalize Arith ops to be convertible to LLVM."; let dependentDialects = ["vector::VectorDialect"]; diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h index 459c252b707121..e053e6c97e1430 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h @@ -221,9 +221,6 @@ createPromoteBuffersToStackPass(std::function isSmallAlloc); /// insert_slice ops. std::unique_ptr createEmptyTensorEliminationPass(); -/// Create a pass that bufferizes ops from the bufferization dialect. 
-std::unique_ptr createBufferizationBufferizePass(); - //===----------------------------------------------------------------------===// // Registration //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index 75ce85c9128c94..8f8826b9ad56b4 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -350,11 +350,6 @@ def FinalizingBufferize : Pass<"finalizing-bufferize", "func::FuncOp"> { let constructor = "mlir::bufferization::createFinalizingBufferizePass()"; } -def BufferizationBufferize : Pass<"bufferization-bufferize", "func::FuncOp"> { - let summary = "Bufferize the `bufferization` dialect"; - let constructor = "mlir::bufferization::createBufferizationBufferizePass()"; -} - def DropEquivalentBufferResults : Pass<"drop-equivalent-buffer-results", "ModuleOp"> { let summary = "Remove MemRef return values that are equivalent to a bbArg"; let description = [{ diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h index d36d1e70f0b14d..f2955d55e59eca 100644 --- a/mlir/include/mlir/Dialect/Linalg/Passes.h +++ b/mlir/include/mlir/Dialect/Linalg/Passes.h @@ -22,10 +22,6 @@ namespace func { class FuncOp; } // namespace func -namespace bufferization { -struct OneShotBufferizationOptions; -} // namespace bufferization - #define GEN_PASS_DECL #include "mlir/Dialect/Linalg/Passes.h.inc" // IWYU pragma: keep diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td index 0a4ce8953136dd..0621a9f33ba1e8 100644 --- a/mlir/include/mlir/Dialect/Linalg/Passes.td +++ b/mlir/include/mlir/Dialect/Linalg/Passes.td @@ -89,16 +89,6 @@ def LinalgInlineScalarOperandsPass : Pass<"linalg-inline-scalar-operands"> { ]; } -def LinalgBufferizePass : 
Pass<"linalg-bufferize"> { - let summary = "Bufferize the linalg dialect"; - let dependentDialects = [ - "affine::AffineDialect", - "bufferization::BufferizationDialect", - "linalg::LinalgDialect", - "memref::MemRefDialect", - ]; -} - def LinalgGeneralizeNamedOpsPass : Pass<"linalg-generalize-named-ops"> { let summary = "Convert named ops into generic ops"; let dependentDialects = ["linalg::LinalgDialect"]; diff --git a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h index cfb637f133f54c..28e17459ff9625 100644 --- a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h @@ -47,13 +47,6 @@ void populateShapeRewritePatterns(RewritePatternSet &patterns); void populateRemoveShapeConstraintsPatterns(RewritePatternSet &patterns); std::unique_ptr> createRemoveShapeConstraintsPass(); -// Bufferizes shape dialect ops. -// -// Note that most shape dialect ops must be converted to std before -// bufferization happens, as they are intended to be bufferized at the std -// level. -std::unique_ptr> createShapeBufferizePass(); - /// Outline the shape computation part by adding shape.func and populate /// conrresponding mapping infomation into ShapeMappingAnalysis. std::unique_ptr> createOutlineShapeComputationPass(); diff --git a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td index 9dfda9ea336153..83834509b4a35a 100644 --- a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td @@ -103,11 +103,4 @@ def ShapeToShapeLowering : Pass<"shape-to-shape-lowering", "func::FuncOp"> { let constructor = "mlir::createShapeToShapeLowering()"; } -// TODO: Generalize this to allow any type conversions desired. 
-def ShapeBufferize : Pass<"shape-bufferize", "func::FuncOp"> { - let summary = "Bufferize the shape dialect."; - let constructor = "mlir::createShapeBufferizePass()"; - let dependentDialects = ["bufferization::BufferizationDialect", - "memref::MemRefDialect"]; -} #endif // MLIR_DIALECT_SHAPE_TRANSFORMS_PASSES diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h index 48f9066934a25e..964c35b3f15b80 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h @@ -21,9 +21,6 @@ namespace tensor { /// Creates an instance of the `tensor` subset folding pass. std::unique_ptr createFoldTensorSubsetOpsPass(); -/// Creates an instance of the `tensor` dialect bufferization pass. -std::unique_ptr createTensorBufferizePass(); - //===----------------------------------------------------------------------===// // Registration //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td index 4cc3844f29120b..be4c333836ec07 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td @@ -27,9 +27,4 @@ def FoldTensorSubsetOps : Pass<"fold-tensor-subset-ops"> { ]; } -def TensorBufferize : Pass<"tensor-bufferize", "func::FuncOp"> { - let summary = "Bufferize the `tensor` dialect"; - let constructor = "mlir::tensor::createTensorBufferizePass()"; -} - #endif // MLIR_DIALECT_TENSOR_TRANSFORMS_PASSES diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h index 911402551e14d4..5667f4fa95ace4 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h @@ -17,9 +17,6 @@ namespace vector { #define GEN_PASS_DECL #include 
"mlir/Dialect/Vector/Transforms/Passes.h.inc" -/// Creates an instance of the `vector` dialect bufferization pass. -std::unique_ptr createVectorBufferizePass(); - /// Creates an instance of the `vector.mask` lowering pass. std::unique_ptr createLowerVectorMaskPass(); diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td index 31a0b3b2f0c53d..74369987497910 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td @@ -11,11 +11,6 @@ include "mlir/Pass/PassBase.td" -def VectorBufferize : Pass<"vector-bufferize", "func::FuncOp"> { - let summary = "Bufferize Vector dialect ops"; - let constructor = "mlir::vector::createVectorBufferizePass()"; -} - def LowerVectorMaskPass : Pass<"lower-vector-mask", "func::FuncOp"> { let summary = "Lower 'vector.mask' operations"; let constructor = "mlir::vector::createLowerVectorMaskPass()"; diff --git a/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp deleted file mode 100644 index 9a066756f429ca..00000000000000 --- a/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp +++ /dev/null @@ -1,67 +0,0 @@ -//===- Bufferize.cpp - Bufferization for Arith ops ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Arith/Transforms/Passes.h" - -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" - -namespace mlir { -namespace arith { -#define GEN_PASS_DEF_ARITHBUFFERIZEPASS -#include "mlir/Dialect/Arith/Transforms/Passes.h.inc" -} // namespace arith -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -/// Pass to bufferize Arith ops. -struct ArithBufferizePass - : public arith::impl::ArithBufferizePassBase { - using ArithBufferizePassBase::ArithBufferizePassBase; - - ArithBufferizePass(uint64_t alignment = 0, bool constantOpOnly = false) - : constantOpOnly(constantOpOnly) { - this->alignment = alignment; - } - - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - if (constantOpOnly) { - options.opFilter.allowOperation(); - } else { - options.opFilter.allowDialect(); - } - options.bufferAlignment = alignment; - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - arith::registerBufferizableOpInterfaceExternalModels(registry); - } - -private: - bool constantOpOnly; -}; -} // namespace - -std::unique_ptr -mlir::arith::createConstantBufferizePass(uint64_t alignment) { - return std::make_unique(alignment, - /*constantOpOnly=*/true); -} diff --git a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt index 12659eaba1fa5e..6b8bde8dc2aaf3 100644 --- 
a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt @@ -1,7 +1,6 @@ add_mlir_dialect_library(MLIRArithTransforms BufferDeallocationOpInterfaceImpl.cpp BufferizableOpInterfaceImpl.cpp - Bufferize.cpp BufferViewFlowOpInterfaceImpl.cpp EmulateUnsupportedFloats.cpp EmulateWideInt.cpp diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index 7ba347a1f15e47..0fddd60eb8140e 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -320,29 +320,6 @@ struct OneShotBufferizePass }; } // namespace -namespace { -struct BufferizationBufferizePass - : public bufferization::impl::BufferizationBufferizeBase< - BufferizationBufferizePass> { - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry - .insert(); - } -}; -} // namespace - -std::unique_ptr mlir::bufferization::createBufferizationBufferizePass() { - return std::make_unique(); -} - std::unique_ptr mlir::bufferization::createOneShotBufferizePass() { return std::make_unique(); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp deleted file mode 100644 index 8812ca14ba6109..00000000000000 --- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp +++ /dev/null @@ -1,52 +0,0 @@ -//===- Bufferize.cpp - Bufferization of linalg ops ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Linalg/Passes.h" - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Linalg/Transforms/Transforms.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/BuiltinDialect.h" -#include "mlir/IR/Operation.h" -#include "mlir/Pass/Pass.h" - -namespace mlir { -#define GEN_PASS_DEF_LINALGBUFFERIZEPASS -#include "mlir/Dialect/Linalg/Passes.h.inc" -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -/// Converts Linalg operations that work on tensor-type operands or results to -/// work on buffers. 
-struct LinalgBufferizePass - : public impl::LinalgBufferizePassBase { - using impl::LinalgBufferizePassBase< - LinalgBufferizePass>::LinalgBufferizePassBase; - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - linalg::registerBufferizableOpInterfaceExternalModels(registry); - } -}; -} // namespace diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index ed9f40089282a6..7e3dc56e0acdc9 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -2,7 +2,6 @@ add_mlir_dialect_library(MLIRLinalgTransforms AllInterfaces.cpp BubbleUpExtractSlice.cpp BufferizableOpInterfaceImpl.cpp - Bufferize.cpp ConstantFold.cpp ConvertToDestinationStyle.cpp ConvertConv2DToImg2Col.cpp diff --git a/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp deleted file mode 100644 index 9dadbdbc91eca9..00000000000000 --- a/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp +++ /dev/null @@ -1,49 +0,0 @@ -//====----- Bufferize.cpp - Bufferization of shape ops ---------*- C++-*--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Shape/Transforms/Passes.h" - -#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Shape/IR/Shape.h" -#include "mlir/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Pass/Pass.h" - -namespace mlir { -#define GEN_PASS_DEF_SHAPEBUFFERIZE -#include "mlir/Dialect/Shape/Transforms/Passes.h.inc" -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -struct ShapeBufferizePass - : public impl::ShapeBufferizeBase { - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - shape::registerBufferizableOpInterfaceExternalModels(registry); - } -}; -} // namespace - -std::unique_ptr> mlir::createShapeBufferizePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt index 7c9b0d2e5e3a8e..a51c6780c28665 100644 --- a/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_dialect_library(MLIRShapeOpsTransforms BufferizableOpInterfaceImpl.cpp - Bufferize.cpp OutlineShapeComputation.cpp RemoveShapeConstraints.cpp ShapeToShapeLowering.cpp diff --git a/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp deleted file mode 100644 index 
d27c4576a8b7a9..00000000000000 --- a/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp +++ /dev/null @@ -1,58 +0,0 @@ -//===- Bufferize.cpp - Bufferization for `tensor` dialect ops -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements bufferization of `tensor` dialect ops -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/Transforms/Passes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" -#include "mlir/Transforms/DialectConversion.h" - -namespace mlir { -namespace tensor { -#define GEN_PASS_DEF_TENSORBUFFERIZE -#include "mlir/Dialect/Tensor/Transforms/Passes.h.inc" -} // namespace tensor -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -struct TensorBufferizePass - : public tensor::impl::TensorBufferizeBase { - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry - .insert(); - tensor::registerBufferizableOpInterfaceExternalModels(registry); - } -}; -} // namespace - 
-std::unique_ptr mlir::tensor::createTensorBufferizePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt index 0aabdaf667b9d8..ce32dea09bb0b5 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_dialect_library(MLIRTensorTransforms BufferizableOpInterfaceImpl.cpp - Bufferize.cpp ConcatOpPatterns.cpp EmptyOpPatterns.cpp ExtractSliceFromReshapeUtils.cpp diff --git a/mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp deleted file mode 100644 index ee99a99b561090..00000000000000 --- a/mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp +++ /dev/null @@ -1,55 +0,0 @@ -//===- Bufferize.cpp - Bufferization for `vector` dialect ops -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements bufferization of `vector` dialect ops -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" - -#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Vector/Transforms/Passes.h" - -namespace mlir { -namespace vector { -#define GEN_PASS_DEF_VECTORBUFFERIZE -#include "mlir/Dialect/Vector/Transforms/Passes.h.inc" -} // namespace vector -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -struct VectorBufferizePass - : public vector::impl::VectorBufferizeBase { - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - vector::registerBufferizableOpInterfaceExternalModels(registry); - } -}; -} // namespace - -std::unique_ptr mlir::vector::createVectorBufferizePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt index c4b6abd3e23615..4dbefdd376a8b9 100644 --- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_dialect_library(MLIRVectorTransforms BufferizableOpInterfaceImpl.cpp - Bufferize.cpp 
LowerVectorBroadcast.cpp LowerVectorContract.cpp LowerVectorGather.cpp diff --git a/mlir/test/Dialect/Arith/bufferize.mlir b/mlir/test/Dialect/Arith/bufferize.mlir index 944954e9e4edd8..a3b1454fb68f66 100644 --- a/mlir/test/Dialect/Arith/bufferize.mlir +++ b/mlir/test/Dialect/Arith/bufferize.mlir @@ -1,5 +1,4 @@ -// RUN: mlir-opt %s -arith-bufferize -split-input-file -verify-diagnostics | FileCheck %s -// RUN: mlir-opt %s -arith-bufferize=alignment=64 -split-input-file -verify-diagnostics | FileCheck --check-prefix=ALIGNED %s +// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=arith,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -split-input-file -verify-diagnostics | FileCheck %s // CHECK-LABEL: func @index_cast( // CHECK-SAME: %[[TENSOR:.*]]: tensor, %[[SCALAR:.*]]: i32 @@ -22,10 +21,7 @@ func.func @index_cast(%tensor: tensor, %scalar: i32) -> (tensor, ind // The name isn't load-bearing though. // CHECK: memref.global "private" constant @__constant_3x4xf32 : memref<3x4xf32> = dense<7.000000e+00> -// CHECK-NOT: alignment - -// ALIGNED: memref.global "private" constant @__constant_3x4xf32 : memref<3x4xf32> = dense<7.000000e+00> -// ALIGNED-SAME: {alignment = 64 : i64} +// CHECK-SAME: {alignment = 64 : i64} // CHECK: @basic func.func @basic() -> tensor<3x4xf32> { diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir index 29f27e6838e661..e8ab1184b1fd26 100644 --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -linalg-bufferize -canonicalize -cse -split-input-file %s | FileCheck %s +// RUN: mlir-opt --one-shot-bufferize="dialect-filter=linalg,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -canonicalize -cse -split-input-file %s | FileCheck %s #map0 = affine_map<(d0) -> (d0)> @@ -189,31 +189,3 @@ func.func @bufferize_dot(%in: tensor<4xf32>, %out: tensor) -> tensor { // CHECK: 
%[[OUT_TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]] : memref // CHECK: return %[[OUT_TENSOR]] } - -// ----- - -// This is a regression test. The linalg-bufferize pass should ignore all func -// dialect ops. - -// CHECK-LABEL: func private @csum(tensor<6xi64>) -> tensor<6xi64> -func.func private @csum(%arg0: tensor<6xi64>) -> tensor<6xi64> - -// CHECK: func public @main(%[[arg0:.*]]: tensor<2x3xi1>) -// CHECK: %[[collapse:.*]] = tensor.collapse_shape %[[arg0]] -// CHECK: %[[collapse_m:.*]] = bufferization.to_memref %[[collapse]] -// CHECK: %[[alloc:.*]] = memref.alloc() -// CHECK: linalg.generic {{.*}} ins(%[[collapse_m]] : memref<6xi1>) outs(%[[alloc]] : memref<6xi64>) -// CHECK: %[[generic_t:.*]] = bufferization.to_tensor %[[alloc]] -// CHECK: %[[call:.*]] = call @csum(%[[generic_t]]) -// CHECK: return %[[call]] -func.func public @main(%arg0: tensor<2x3xi1>) -> tensor<6xi64> { - %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<2x3xi1> into tensor<6xi1> - %1 = tensor.empty() : tensor<6xi64> - %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<6xi1>) outs(%1 : tensor<6xi64>) { - ^bb0(%arg1: i1, %arg2: i64): - %4 = arith.extui %arg1 : i1 to i64 - linalg.yield %4 : i64 - } -> tensor<6xi64> - %3 = func.call @csum(%2) : (tensor<6xi64>) -> tensor<6xi64> - return %3 : tensor<6xi64> -} diff --git a/mlir/test/Dialect/Shape/bufferize.mlir b/mlir/test/Dialect/Shape/bufferize.mlir index 963a5e8bcf5787..9f30a052208f0b 100644 --- a/mlir/test/Dialect/Shape/bufferize.mlir +++ b/mlir/test/Dialect/Shape/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -shape-bufferize <%s | FileCheck %s +// RUN: mlir-opt -split-input-file --one-shot-bufferize="dialect-filter=shape,bufferization copy-before-write unknown-type-conversion=identity-layout-map allow-unknown-ops" <%s | FileCheck %s // ----- diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir 
b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir index 6112856fbf2931..c27df00785522a 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir @@ -4,8 +4,7 @@ // RUN: FileCheck %s --check-prefix=CHECK-MIR // // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification --sparse-tensor-conversion --cse \ -// RUN: --func-bufferize --arith-bufferize \ -// RUN: --tensor-bufferize --finalizing-bufferize | \ +// RUN: --one-shot-bufferize="copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" | \ // RUN: FileCheck %s --check-prefix=CHECK-LIR #CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}> diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir index 401da152a8bdb8..9fbb9dd0a26d17 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir @@ -4,8 +4,7 @@ // RUN: FileCheck %s --check-prefix=CHECK-MIR // // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification --sparse-tensor-conversion --cse \ -// RUN: --func-bufferize --arith-bufferize \ -// RUN: --tensor-bufferize --finalizing-bufferize | \ +// RUN: --one-shot-bufferize="copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" | \ // RUN: FileCheck %s --check-prefix=CHECK-LIR #CSC = #sparse_tensor.encoding<{ diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir index d769876d8ee8e3..a827360abb4267 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir @@ -4,8 +4,7 @@ // RUN: FileCheck %s --check-prefix=CHECK-MIR // // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification --sparse-tensor-conversion --cse \ -// RUN: --func-bufferize --arith-bufferize 
\ -// RUN: --tensor-bufferize --finalizing-bufferize | \ +// RUN: --one-shot-bufferize="copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" | \ // RUN: FileCheck %s --check-prefix=CHECK-LIR #CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}> diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir index 4f553adcc500fb..e85d9e740adf4e 100644 --- a/mlir/test/Dialect/Tensor/bufferize.mlir +++ b/mlir/test/Dialect/Tensor/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -tensor-bufferize -cse -split-input-file | FileCheck %s +// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=tensor,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -cse -split-input-file | FileCheck %s // CHECK-LABEL: func @dim( // CHECK-SAME: %[[TENSOR:.*]]: tensor<*xf32>, diff --git a/mlir/test/Dialect/Vector/bufferize-invalid.mlir b/mlir/test/Dialect/Vector/bufferize-invalid.mlir index 1ae3e312c868f7..bcca50a0fe79a6 100644 --- a/mlir/test/Dialect/Vector/bufferize-invalid.mlir +++ b/mlir/test/Dialect/Vector/bufferize-invalid.mlir @@ -1,5 +1,4 @@ -// RUN: mlir-opt %s -vector-bufferize -split-input-file -verify-diagnostics -// | FileCheck %s +// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=vector,bufferization copy-before-write unknown-type-conversion=identity-layout-map allow-unknown-ops" -split-input-file -verify-diagnostics // CHECK-LABEL: func @mask( func.func @mask(%t0: tensor, %val: vector<16xf32>, %idx: index, %m0: vector<16xi1>) -> tensor { diff --git a/mlir/test/Dialect/Vector/bufferize.mlir b/mlir/test/Dialect/Vector/bufferize.mlir index 6a6a8fa8938bc2..3399f60a2c3bf3 100644 --- a/mlir/test/Dialect/Vector/bufferize.mlir +++ b/mlir/test/Dialect/Vector/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -vector-bufferize -split-input-file | FileCheck %s +// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=vector,bufferization 
copy-before-write unknown-type-conversion=identity-layout-map" -split-input-file | FileCheck %s // CHECK-LABEL: func @transfer_read( // CHECK-SAME: %[[t:.*]]: tensor, %[[o1:.*]]: index, %[[o2:.*]]: index, %[[pad:.*]]: f32) diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir index b0e414d157268b..5d27c3e290d50c 100644 --- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir +++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: -func-bufferize -tensor-bufferize -arith-bufferize --canonicalize \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \ // RUN: -convert-scf-to-cf --convert-complex-to-standard \ // RUN: -finalize-memref-to-llvm -convert-math-to-llvm -convert-math-to-libm \ // RUN: -convert-vector-to-llvm -convert-complex-to-llvm \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir index 43e423d4c3e8e1..734e09b7ed103d 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir @@ -1,10 +1,10 @@ -// RUN: mlir-opt %s -linalg-bufferize \ -// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-scf-to-cf -expand-strided-metadata -lower-affine -convert-cf-to-llvm -convert-arith-to-llvm \ // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_runner_utils \ +// RUN: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \ // RUN: | FileCheck %s diff --git 
a/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir index 84dad567ced3ff..a323b0d9f876cf 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s -convert-elementwise-to-linalg \ -// RUN: -arith-bufferize -linalg-bufferize -tensor-bufferize -func-bufferize \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -canonicalize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops \ // RUN: -convert-scf-to-cf -convert-arith-to-llvm -convert-cf-to-llvm --finalize-memref-to-llvm \ // RUN: -convert-func-to-llvm -reconcile-unrealized-casts | \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir index db882f7a54d392..45283e173c9f02 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir @@ -1,10 +1,10 @@ -// RUN: mlir-opt %s -linalg-bufferize \ -// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-scf-to-cf -expand-strided-metadata -lower-affine -convert-cf-to-llvm -convert-arith-to-llvm \ // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_runner_utils \ +// RUN: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \ // RUN: | FileCheck %s diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir index 54a2bbf8d46809..23a07464bb5be9 100644 
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir @@ -1,6 +1,5 @@ // RUN: mlir-opt %s -test-linalg-transform-patterns=test-linalg-to-vector-patterns \ -// RUN: -empty-tensor-to-alloc-tensor -linalg-bufferize -arith-bufferize \ -// RUN: -bufferization-bufferize -tensor-bufferize -func-bufferize \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \ // RUN: -lower-affine -convert-arith-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir index 98fce6c020c03d..01a0ba26fd7cda 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -linalg-bufferize \ -// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \ // RUN: -lower-affine -convert-arith-to-llvm --finalize-memref-to-llvm \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir index cf7d0c762ea36f..73d4aff73fb7a4 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -linalg-bufferize \ -// 
RUN: -arith-bufferize -tensor-bufferize -func-bufferize \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \ // RUN: -lower-affine -convert-arith-to-llvm --finalize-memref-to-llvm \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir index 38b49cd444df3c..ff9ddedf91e177 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt %s -arith-bufferize -linalg-bufferize \ -// RUN: -tensor-bufferize -func-bufferize -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops \ // RUN: -convert-arith-to-llvm -convert-scf-to-cf -convert-cf-to-llvm --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir index 41296cdfcb2d5a..698191577efe31 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir @@ -1,14 +1,14 @@ // UNSUPPORTED: asan -// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -linalg-bufferize -arith-bufferize \ -// RUN: -tensor-bufferize -func-bufferize -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref 
-convert-linalg-to-loops -convert-scf-to-cf \ +// RUN: mlir-opt %s -test-transform-dialect-erase-schedule \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops -convert-scf-to-cf \ // RUN: -expand-strided-metadata -lower-affine -convert-arith-to-llvm -convert-scf-to-cf --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule -linalg-bufferize \ -// RUN: -scf-bufferize -arith-bufferize -tensor-bufferize \ -// RUN: -func-bufferize \ +// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -convert-linalg-to-loops -convert-scf-to-cf -convert-scf-to-cf \ // RUN: -expand-strided-metadata -lower-affine -convert-arith-to-llvm -convert-scf-to-cf --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Memref/print-memref.mlir b/mlir/test/Integration/Dialect/Memref/print-memref.mlir index b83f3919efd83e..f59e220d7461e6 100644 --- a/mlir/test/Integration/Dialect/Memref/print-memref.mlir +++ b/mlir/test/Integration/Dialect/Memref/print-memref.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: -func-bufferize -arith-bufferize --canonicalize \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \ // RUN: -finalize-memref-to-llvm\ // RUN: -convert-func-to-llvm -reconcile-unrealized-casts |\ // RUN: mlir-cpu-runner \ diff --git a/mlir/test/Integration/Dialect/Memref/verify-memref.mlir 
b/mlir/test/Integration/Dialect/Memref/verify-memref.mlir index b7e2a46688f475..431ae0a89d20c3 100644 --- a/mlir/test/Integration/Dialect/Memref/verify-memref.mlir +++ b/mlir/test/Integration/Dialect/Memref/verify-memref.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: -func-bufferize -arith-bufferize --canonicalize \ +// RUN: -func-bufferize -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \ // RUN: -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm\ // RUN: -convert-func-to-llvm -reconcile-unrealized-casts |\ // RUN: mlir-cpu-runner \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir index faa129efa63a91..a7c5b91273423b 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-cf \ -// RUN: -arith-bufferize -convert-vector-to-llvm="enable-amx" \ +// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -convert-scf-to-cf -convert-vector-to-llvm="enable-amx" \ // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-translate -mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="+amx-tile,+amx-int8,+amx-bf16" \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir index 3ed28fc68acb8f..7b7ee54db8c348 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir @@ -1,5 +1,7 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-cf \ -// RUN: -arith-bufferize -convert-vector-to-llvm="enable-amx" \ +// RUN: 
mlir-opt %s -convert-vector-to-scf -lower-affine \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-vector-to-llvm="enable-amx" \ // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-translate -mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="+amx-tile,+amx-int8,+amx-bf16" \ From a4bef0ca826a8145ef3cb288846017c034a817c2 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 28 May 2024 12:15:50 -0700 Subject: [PATCH 50/89] [libc++] Mark P2845R8 `__cpp_lib_format_path` and P2587R3 `__cpp_lib_to_string` as C++26 (#93255) [P2845R8](https://wg21.link/P2845R8) "Formatting of `std::filesystem::path`" and [P2587R3](https://wg21.link/P2587R3) "`to_string` or not `to_string`" are C++26 features, so they should be marked accordingly in `generate_feature_test_macro_components.py`. I verified that without my changes, running the script produced no edits. Then with my changes, I ran the script to regenerate all files, with no other manual edits. Found while running libc++'s tests with MSVC's STL, which noticed this because it's currently a C++23-only implementation. 
Note that @H-G-Hristov has a draft implementation of P2587R3: #78100 --- libcxx/docs/FeatureTestMacroTable.rst | 8 ++-- libcxx/include/version | 4 +- .../filesystem.version.compile.pass.cpp | 23 +++------- .../string.version.compile.pass.cpp | 23 +++------- .../version.version.compile.pass.cpp | 46 ++++++------------- .../generate_feature_test_macro_components.py | 4 +- 6 files changed, 36 insertions(+), 72 deletions(-) diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 17d2da907692e8..0297068785e8b8 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -326,8 +326,6 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_expected`` ``202211L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_format_path`` *unimplemented* - ---------------------------------------------------------- ----------------- ``__cpp_lib_format_ranges`` ``202207L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_formatters`` *unimplemented* @@ -386,8 +384,6 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_string_resize_and_overwrite`` ``202110L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_to_string`` *unimplemented* - ---------------------------------------------------------- ----------------- ``__cpp_lib_to_underlying`` ``202102L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_tuple_like`` *unimplemented* @@ -412,6 +408,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_default_template_type_for_algorithm_values`` *unimplemented* ---------------------------------------------------------- ----------------- + ``__cpp_lib_format_path`` *unimplemented* + 
---------------------------------------------------------- ----------------- ``__cpp_lib_freestanding_algorithm`` *unimplemented* ---------------------------------------------------------- ----------------- ``__cpp_lib_freestanding_array`` *unimplemented* @@ -466,6 +464,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_to_chars`` *unimplemented* ---------------------------------------------------------- ----------------- + ``__cpp_lib_to_string`` *unimplemented* + ---------------------------------------------------------- ----------------- ``__cpp_lib_tuple_like`` *unimplemented* ========================================================== ================= diff --git a/libcxx/include/version b/libcxx/include/version index 69556d731f1cfc..140a9a0d870360 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -459,7 +459,6 @@ __cpp_lib_void_t 201411L # define __cpp_lib_constexpr_typeinfo 202106L # define __cpp_lib_containers_ranges 202202L # define __cpp_lib_expected 202211L -// # define __cpp_lib_format_path 202403L # define __cpp_lib_format_ranges 202207L // # define __cpp_lib_formatters 202302L # define __cpp_lib_forward_like 202207L @@ -490,7 +489,6 @@ __cpp_lib_void_t 201411L # define __cpp_lib_stdatomic_h 202011L # define __cpp_lib_string_contains 202011L # define __cpp_lib_string_resize_and_overwrite 202110L -// # define __cpp_lib_to_string 202306L # define __cpp_lib_to_underlying 202102L // # define __cpp_lib_tuple_like 202207L # define __cpp_lib_unreachable 202202L @@ -506,6 +504,7 @@ __cpp_lib_void_t 201411L // # define __cpp_lib_copyable_function 202306L // # define __cpp_lib_debugging 202311L // # define __cpp_lib_default_template_type_for_algorithm_values 202403L +// # define __cpp_lib_format_path 202403L // # define __cpp_lib_freestanding_algorithm 202311L // # define __cpp_lib_freestanding_array 202311L // # define __cpp_lib_freestanding_cstring 202306L @@ -537,6 +536,7 @@ __cpp_lib_void_t 
201411L // # define __cpp_lib_text_encoding 202306L # undef __cpp_lib_to_chars // # define __cpp_lib_to_chars 202306L +// # define __cpp_lib_to_string 202306L # undef __cpp_lib_tuple_like // # define __cpp_lib_tuple_like 202311L #endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp index 308cc2d43b0586..4aba33482f69c4 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp @@ -20,7 +20,7 @@ /* Constant Value __cpp_lib_char8_t 201907L [C++20] __cpp_lib_filesystem 201703L [C++17] - __cpp_lib_format_path 202403L [C++23] + __cpp_lib_format_path 202403L [C++26] */ #include @@ -37,7 +37,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 14 @@ -51,7 +51,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 17 @@ -74,7 +74,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 20 @@ -106,7 +106,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 23 @@ -137,17 +137,8 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_format_path -# error "__cpp_lib_format_path should be defined in c++23" -# 
endif -# if __cpp_lib_format_path != 202403L -# error "__cpp_lib_format_path should have the value 202403L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_format_path +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp index 16a9a0a28de635..af6386a40a458a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp @@ -29,7 +29,7 @@ __cpp_lib_string_udls 201304L [C++14] __cpp_lib_string_view 201606L [C++17] 201803L [C++20] - __cpp_lib_to_string 202306L [C++23] + __cpp_lib_to_string 202306L [C++26] */ #include @@ -86,7 +86,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 14 @@ -143,7 +143,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 17 @@ -209,7 +209,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 20 @@ -293,7 +293,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 23 @@ -385,17 +385,8 
@@ # error "__cpp_lib_string_view should have the value 201803L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_to_string -# error "__cpp_lib_to_string should be defined in c++23" -# endif -# if __cpp_lib_to_string != 202306L -# error "__cpp_lib_to_string should have the value 202306L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_to_string +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 7829e06f90760b..c1e1f9f340af48 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -88,7 +88,7 @@ __cpp_lib_expected 202211L [C++23] __cpp_lib_filesystem 201703L [C++17] __cpp_lib_format 202106L [C++20] - __cpp_lib_format_path 202403L [C++23] + __cpp_lib_format_path 202403L [C++26] __cpp_lib_format_ranges 202207L [C++23] __cpp_lib_format_uchar 202311L [C++20] __cpp_lib_formatters 202302L [C++23] @@ -216,7 +216,7 @@ __cpp_lib_to_array 201907L [C++20] __cpp_lib_to_chars 201611L [C++17] 202306L [C++26] - __cpp_lib_to_string 202306L [C++23] + __cpp_lib_to_string 202306L [C++26] __cpp_lib_to_underlying 202102L [C++23] __cpp_lib_transformation_trait_aliases 201304L [C++14] __cpp_lib_transparent_operators 201210L [C++14] @@ -513,7 +513,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -1005,7 +1005,7 @@ # 
endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -1348,7 +1348,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -1891,7 +1891,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -2303,7 +2303,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -2972,7 +2972,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -3543,7 +3543,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -4350,7 +4350,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -4971,17 +4971,8 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_format_path -# error "__cpp_lib_format_path should be defined in c++23" -# endif -# if __cpp_lib_format_path != 202403L -# error "__cpp_lib_format_path should have the value 202403L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be 
defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_format_path +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifndef __cpp_lib_format_ranges @@ -5943,17 +5934,8 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_to_string -# error "__cpp_lib_to_string should be defined in c++23" -# endif -# if __cpp_lib_to_string != 202306L -# error "__cpp_lib_to_string should have the value 202306L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_to_string +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifndef __cpp_lib_to_underlying diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index b04cb4f5115547..1e79f6c140758c 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -515,7 +515,7 @@ def add_version_header(tc): }, { "name": "__cpp_lib_format_path", - "values": {"c++23": 202403}, # P2845R8: Formatting of std::filesystem::path + "values": {"c++26": 202403}, # P2845R8: Formatting of std::filesystem::path "headers": ["filesystem"], "unimplemented": True, }, @@ -1270,7 +1270,7 @@ def add_version_header(tc): }, { "name": "__cpp_lib_to_string", - "values": {"c++23": 202306}, # P2587R3 to_string or not to_string + "values": {"c++26": 202306}, # P2587R3 to_string or not to_string "headers": ["string"], "unimplemented": True, }, From 51752ed0dd737f12014a89dec67d25494083153d Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Tue, 28 May 2024 21:17:31 +0200 Subject: [PATCH 51/89] [mlir][nvgpu] verify the module --- mlir/test/Examples/NVGPU/tools/nvdsl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/Examples/NVGPU/tools/nvdsl.py 
b/mlir/test/Examples/NVGPU/tools/nvdsl.py index 600cae5b47eeec..90dbb2355e1c87 100644 --- a/mlir/test/Examples/NVGPU/tools/nvdsl.py +++ b/mlir/test/Examples/NVGPU/tools/nvdsl.py @@ -431,7 +431,7 @@ def __str__(self): # saveIR(module) # Verify the module - # module.operation.verify() + module.operation.verify() # Compile and JIT MLIR module options = f"cubin-chip=sm_90a cubin-features=+ptx80 opt-level=3" From 266fac8375bdf3f039503c559bb16ffab8895ae5 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 28 May 2024 12:17:57 -0700 Subject: [PATCH 52/89] [libc++] [test] Fix MSVC warnings (#93257) Found while running libc++'s tests with MSVC's STL. * Avoid MSVC warning C5101: use of preprocessor directive in function-like macro argument list is undefined behavior. + We can easily make this portable by extracting `const bool is_newlib`. + Followup to #73440. + See #73598. + See #73836. * Avoid MSVC warning C4267: 'return': conversion from 'size_t' to 'int', possible loss of data. + This warning is valid, but harmless for the test, so `static_cast` will avoid it. * Avoid MSVC warning C4146: unary minus operator applied to unsigned type, result still unsigned. + This warning is also valid (the scenario is sometimes intentional, but surprising enough that it's worth warning about). This is a C++17 test, so we can easily avoid it by testing `is_signed_v` at compile-time before testing `m < 0` and `n < 0` at run-time. * Silence MSVC warning C4310: cast truncates constant value. + These warnings are being emitted by `T(255)`. Disabling the warning is simpler than attempting to restructure the code. + Followup to #79791. * MSVC no longer emits warning C4521: multiple copy constructors specified. + This warning was removed from the compiler, since at least 2021-12-09. 
--- .../atomics.ref/compare_exchange_strong.pass.cpp | 3 +++ .../atomics.ref/compare_exchange_weak.pass.cpp | 3 +++ libcxx/test/std/atomics/atomics.ref/wait.pass.cpp | 3 +++ .../views.span/span.cons/initializer_list.pass.cpp | 4 ++-- .../syserr.errcat.objects/generic_category.pass.cpp | 11 +++++++---- .../syserr.errcat.objects/system_category.pass.cpp | 11 +++++++---- .../numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp | 10 ++++++---- libcxx/test/support/msvc_stdlib_force_include.h | 1 - 8 files changed, 31 insertions(+), 15 deletions(-) diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp index 72b2f444c476c7..90aa5ea5b6df45 100644 --- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp @@ -9,6 +9,9 @@ // XFAIL: !has-64-bit-atomics // XFAIL: !has-1024-bit-atomics +// MSVC warning C4310: cast truncates constant value +// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310 + // bool compare_exchange_strong(T&, T, memory_order, memory_order) const noexcept; // bool compare_exchange_strong(T&, T, memory_order = memory_order::seq_cst) const noexcept; diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp index 5219a8e3714f98..99c1385a2fe0b7 100644 --- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp @@ -9,6 +9,9 @@ // XFAIL: !has-64-bit-atomics // XFAIL: !has-1024-bit-atomics +// MSVC warning C4310: cast truncates constant value +// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310 + // bool compare_exchange_weak(T&, T, memory_order, memory_order) const noexcept; // bool compare_exchange_weak(T&, T, memory_order = memory_order::seq_cst) const noexcept; diff --git 
a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp index e5310febf5c5eb..f246803ba25925 100644 --- a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp @@ -11,6 +11,9 @@ // XFAIL: !has-64-bit-atomics // XFAIL: !has-1024-bit-atomics +// MSVC warning C4310: cast truncates constant value +// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310 + // void wait(T, memory_order = memory_order::seq_cst) const noexcept; #include diff --git a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp index 74a5094f61261d..bc76e23fea3c03 100644 --- a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp +++ b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp @@ -93,9 +93,9 @@ constexpr bool test() { // Test P2447R4 "Annex C examples" -constexpr int three(std::span sp) { return sp.size(); } +constexpr int three(std::span sp) { return static_cast(sp.size()); } -constexpr int four(std::span sp) { return sp.size(); } +constexpr int four(std::span sp) { return static_cast(sp.size()); } bool test_P2447R4_annex_c_examples() { // 1. Overload resolution is affected diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp index d4bbde75ae8821..7283fdc769d86b 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp @@ -50,13 +50,16 @@ int main(int, char**) // responds with an empty message, which we probably want to // treat as a failure code otherwise, but we can detect that // with the preprocessor. 
+#if defined(_NEWLIB_VERSION) + const bool is_newlib = true; +#else + const bool is_newlib = false; +#endif + (void)is_newlib; LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0 // AIX || msg.rfind("No error information", 0) == 0 // Musl || msg.rfind("Unknown error", 0) == 0 // Glibc -#if defined(_NEWLIB_VERSION) - || msg.empty() -#endif - ); + || (is_newlib && msg.empty())); assert(errno == E2BIG); } diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp index eefbddd27a7f53..02a1baf5999831 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp @@ -56,13 +56,16 @@ int main(int, char**) { // responds with an empty message, which we probably want to // treat as a failure code otherwise, but we can detect that // with the preprocessor. 
+#if defined(_NEWLIB_VERSION) + const bool is_newlib = true; +#else + const bool is_newlib = false; +#endif + (void)is_newlib; LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0 // AIX || msg.rfind("No error information", 0) == 0 // Musl || msg.rfind("Unknown error", 0) == 0 // Glibc -#if defined(_NEWLIB_VERSION) - || msg.empty() -#endif - ); + || (is_newlib && msg.empty())); assert(errno == E2BIG); } diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp index 212804356a056d..bf40b174b209cc 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp @@ -57,10 +57,12 @@ T basic_gcd_(T m, T n) { template T basic_gcd(T m, T n) { using Tp = std::make_unsigned_t; - if (m < 0 && m != std::numeric_limits::min()) - m = -m; - if (n < 0 && n != std::numeric_limits::min()) - n = -n; + if constexpr (std::is_signed_v) { + if (m < 0 && m != std::numeric_limits::min()) + m = -m; + if (n < 0 && n != std::numeric_limits::min()) + n = -n; + } return basic_gcd_(static_cast(m), static_cast(n)); } diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h index 6c26085e72c45f..35783c1607b0e0 100644 --- a/libcxx/test/support/msvc_stdlib_force_include.h +++ b/libcxx/test/support/msvc_stdlib_force_include.h @@ -67,7 +67,6 @@ const AssertionDialogAvoider assertion_dialog_avoider{}; // Silence compiler warnings. # pragma warning(disable : 4180) // qualifier applied to function type has no meaning; ignored # pragma warning(disable : 4324) // structure was padded due to alignment specifier -# pragma warning(disable : 4521) // multiple copy constructors specified # pragma warning(disable : 4702) // unreachable code # pragma warning(disable : 28251) // Inconsistent annotation for 'new': this instance has no annotations. 
#endif // !defined(__clang__) From 2ba08386156ef25913b1bee170d8fe95aaceb234 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 28 May 2024 12:20:58 -0700 Subject: [PATCH 53/89] [libc++] [test] Fix portability issues for MSVC (#93259) * Guard `std::__make_from_tuple_impl` tests with `#ifdef _LIBCPP_VERSION` and `LIBCPP_STATIC_ASSERT`. * Change `_LIBCPP_CONSTEXPR_SINCE_CXX20` to `TEST_CONSTEXPR_CXX20`. + Other functions in `variant.swap/swap.pass.cpp` were already using the proper test macro. * Mark `what` as `[[maybe_unused]]` when used by `TEST_LIBCPP_REQUIRE`. + This updates one occurrence in `libcxx/test/libcxx` for consistency. * Windows `_putenv_s()` takes 2 arguments, not 3. + See MSVC documentation: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/putenv-s-wputenv-s?view=msvc-170 + POSIX `setenv()` takes `int overwrite`, but Windows `_putenv_s()` always overwrites. * Avoid non-Standard zero-length arrays. + Followup to #74183 and #79792. * Add `operator++()` to `unsized_it`. + The Standard requires this due to [N4981][] [move.iter.requirements]/1 "The template parameter `Iterator` shall either meet the *Cpp17InputIterator* requirements ([input.iterators]) or model `input_iterator` ([iterator.concept.input])." + MSVC's STL requires this because it has a strengthened exception specification in `move_iterator` that inspects the underlying iterator's increment operator. * `uniform_int_distribution` forbids `int8_t`/`uint8_t`. + See [N4981][] [rand.req.genl]/1.5. MSVC's STL enforces this. + Note that when changing the distribution's `IntType`, we need to be careful to preserve the original value range of `[0, max_input]`. * fstreams are constructible from `const fs::path::value_type*` on wide systems. + See [ifstream.cons], [ofstream.cons], [fstream.cons]. * In `msvc_stdlib_force_include.h`, map `_HAS_CXX23` to `TEST_STD_VER` 23 instead of 99. 
+ On 2023-05-23, https://github.com/llvm/llvm-project/commit/71400505ca048507e827013eb1ea0bc863525cab started recognizing 23 as a distinct value. * Fix test name typo: `destory_elements.pass.cpp` => `destroy_elements.pass.cpp` [N4981]: https://wg21.link/N4981 --- .../time.zone.db.tzdb/locate_zone.pass.cpp | 2 +- .../ranges.contains_subrange.pass.cpp | 25 +++++++++-------- ...nts.pass.cpp => destroy_elements.pass.cpp} | 0 .../fstreams/fstream.cons/path.pass.cpp | 2 +- .../fstreams/ifstream.cons/path.pass.cpp | 2 +- .../fstreams/ofstream.cons/path.pass.cpp | 2 +- .../sized_sentinel.compile.pass.cpp | 1 + .../numeric.ops/numeric.ops.gcd/gcd.pass.cpp | 9 ++++-- .../time.zone.db.access/current_zone.pass.cpp | 2 +- .../time.zone.db.access/locate_zone.pass.cpp | 2 +- .../time.zone.db.tzdb/current_zone.pass.cpp | 2 +- .../time.zone.db.tzdb/locate_zone.pass.cpp | 2 +- .../tuple.apply/make_from_tuple.pass.cpp | 28 ++++++++++--------- .../variant.swap/swap.pass.cpp | 2 +- .../test/support/msvc_stdlib_force_include.h | 2 +- 15 files changed, 45 insertions(+), 38 deletions(-) rename libcxx/test/std/containers/sequences/vector/vector.modifiers/{destory_elements.pass.cpp => destroy_elements.pass.cpp} (100%) diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp index 3ee213358f3524..08c682964c3745 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -73,7 +73,7 @@ L link link_to_link TEST_VALIDATE_EXCEPTION( std::runtime_error, [&]([[maybe_unused]] const std::runtime_error& e) { - std::string_view what{"tzdb: requested time zone not found"}; + [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"}; TEST_LIBCPP_REQUIRE( e.what() == what, TEST_WRITE_CONCATENATED("\nExpected exception ", 
what, "\nActual exception ", e.what(), '\n')); diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp index 761691c2afdcb9..890ac23fff8327 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp @@ -24,6 +24,7 @@ // Proj1 proj1 = {}, Proj2 proj2 = {}); // since C++23 #include +#include #include #include #include @@ -130,10 +131,10 @@ constexpr void test_iterators() { } { // range has zero length - int a[] = {}; - int p[] = {3, 4, 2}; - auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(a))); - auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p)))); + std::array a = {}; + int p[] = {3, 4, 2}; + auto whole = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data()))); + auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p)))); { bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end()); assert(!ret); @@ -145,10 +146,10 @@ constexpr void test_iterators() { } { // subrange has zero length - int a[] = {3, 4, 2}; - int p[] = {}; - auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a)))); - auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(p))); + int a[] = {3, 4, 2}; + std::array p = {}; + auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a)))); + auto subrange = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data()))); { bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end()); assert(ret); @@ -160,10 +161,10 @@ constexpr void test_iterators() { } { // range and subrange both have zero length - int a[] = {}; - int p[] = {}; - auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(a))); - auto subrange = 
std::ranges::subrange(Iter2(p), Sent2(Iter2(p))); + std::array a = {}; + std::array p = {}; + auto whole = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data()))); + auto subrange = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data()))); { bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end()); assert(ret); diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp similarity index 100% rename from libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp rename to libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp index 5edf22eaacf31f..d6bb56d9b78b79 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp @@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() { static_assert(!std::is_constructible_v>); // Char* pointers - if constexpr (!std::is_same_v) + if constexpr (!std::is_same_v && !std::is_same_v) static_assert(!std::is_constructible_v); // Iterators diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp index 2f27fd8e6e93d3..792b65615679a7 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp @@ -38,7 +38,7 @@ constexpr bool test_non_convert_to_path() { static_assert(!std::is_constructible_v>); // Char* pointers - if constexpr (!std::is_same_v) + if constexpr (!std::is_same_v 
&& !std::is_same_v) static_assert(!std::is_constructible_v); // Iterators diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp index e55adfd83fc3c7..602bdadd85813f 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp @@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() { static_assert(!std::is_constructible_v>); // Char* pointers - if constexpr (!std::is_same_v) + if constexpr (!std::is_same_v && !std::is_same_v) static_assert(!std::is_constructible_v); // Iterators diff --git a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp index cb49086dd6802b..998b13ed494552 100644 --- a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp @@ -21,6 +21,7 @@ struct unsized_it { using difference_type = std::ptrdiff_t; value_type& operator*() const; + unsized_it& operator++(); bool operator==(const unsized_it&) const; difference_type operator-(const unsized_it&) const { return 0; } }; diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp index bf40b174b209cc..6a9ec1a2ffec24 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -69,12 +70,14 @@ T basic_gcd(T m, T n) { template void do_fuzzy_tests() { std::mt19937 gen(1938); - std::uniform_int_distribution distrib; + using DistIntType = std::conditional_t; // See 
N4981 [rand.req.genl]/1.5 + constexpr Input max_input = std::numeric_limits::max(); + std::uniform_int_distribution distrib(0, max_input); constexpr int nb_rounds = 10000; for (int i = 0; i < nb_rounds; ++i) { - Input n = distrib(gen); - Input m = distrib(gen); + Input n = static_cast(distrib(gen)); + Input m = static_cast(distrib(gen)); assert(std::gcd(n, m) == basic_gcd(n, m)); } } diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp index 2c43e121613c77..f31a679dd6214f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp @@ -32,7 +32,7 @@ static void set_tz(std::string zone) { // Unlike POSIX it does not mention the string of putenv becomes part // of the environment. - int status = _putenv_s("TZ", zone.c_str(), 1); + int status = _putenv_s("TZ", zone.c_str()); assert(status == 0); } diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp index 4d600fcdf40e3f..8dd895fd21814f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp @@ -40,7 +40,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) { TEST_VALIDATE_EXCEPTION( std::runtime_error, [&]([[maybe_unused]] const std::runtime_error& e) { - std::string_view what{"tzdb: requested time zone not found"}; + [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"}; TEST_LIBCPP_REQUIRE( e.what() == what, TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); diff --git 
a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp index e6497e26323ce6..98509c298ebcb8 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp @@ -34,7 +34,7 @@ static void set_tz(std::string zone) { // Unlike POSIX it does not mention the string of putenv becomes part // of the environment. - int status = _putenv_s("TZ", zone.c_str(), 1); + int status = _putenv_s("TZ", zone.c_str()); assert(status == 0); } diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp index f929dafcc96838..08ce48dfd0edb2 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -42,7 +42,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) { TEST_VALIDATE_EXCEPTION( std::runtime_error, [&]([[maybe_unused]] const std::runtime_error& e) { - std::string_view what{"tzdb: requested time zone not found"}; + [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"}; TEST_LIBCPP_REQUIRE( e.what() == what, TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp index d7374351afa8bf..accb601dd00365 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp @@ -209,6 +209,7 @@ template static constexpr bool can_make_from_tuple = 
std::is_same_v(T{}, Tuple{})), uint8_t>; +#ifdef _LIBCPP_VERSION template auto test_make_from_tuple_impl(T&&, Tuple&& t) -> decltype(std::__make_from_tuple_impl( @@ -224,6 +225,7 @@ uint32_t test_make_from_tuple_impl(...) { template static constexpr bool can_make_from_tuple_impl = std::is_same_v(T{}, Tuple{})), uint8_t>; +#endif // _LIBCPP_VERSION struct A { int a; @@ -263,23 +265,23 @@ static_assert(can_make_from_tuple>); // Test std::__make_from_tuple_impl constraints. // reinterpret_cast -static_assert(!can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); // const_cast -static_assert(!can_make_from_tuple_impl>); -static_assert(!can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); // static_cast -static_assert(!can_make_from_tuple_impl>); -static_assert(!can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); } // namespace LWG3528 diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp index db05691c55818c..039a2373348c4e 100644 --- 
a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp +++ b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp @@ -516,7 +516,7 @@ constexpr void test_swap_sfinae() { } } -_LIBCPP_CONSTEXPR_SINCE_CXX20 void test_swap_noexcept() { +TEST_CONSTEXPR_CXX20 void test_swap_noexcept() { { using V = std::variant; static_assert(std::is_swappable_v && has_swap_member(), ""); diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h index 35783c1607b0e0..785670224c3b18 100644 --- a/libcxx/test/support/msvc_stdlib_force_include.h +++ b/libcxx/test/support/msvc_stdlib_force_include.h @@ -90,7 +90,7 @@ const AssertionDialogAvoider assertion_dialog_avoider{}; #include #if _HAS_CXX23 -# define TEST_STD_VER 99 +# define TEST_STD_VER 23 #elif _HAS_CXX20 # define TEST_STD_VER 20 #elif _HAS_CXX17 From bc247ba113543b07fcff769ab616cf9509eb2794 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 12:42:31 -0700 Subject: [PATCH 54/89] [memprof] Rename memprof-merge-v0.test to memprof-merge-versions.test (#93602) Despite the name, the test is used to test merge/show roundtrips for different MemProf versions. This patch renames the test to match the reality. --- .../{memprof-merge-v0.test => memprof-merge-versions.test} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llvm/test/tools/llvm-profdata/{memprof-merge-v0.test => memprof-merge-versions.test} (100%) diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-v0.test b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test similarity index 100% rename from llvm/test/tools/llvm-profdata/memprof-merge-v0.test rename to llvm/test/tools/llvm-profdata/memprof-merge-versions.test From 1c3a3f0e79a9c6a7c1c4a71c43a9eab783c3b266 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 12:49:42 -0700 Subject: [PATCH 55/89] [LegalizeTypes] Use VP_AND and VP_SHL/VP_SRA to promote operands fo VP arithmetic. 
(#92799) This adds VPSExtPromotedInteger and VPZExtPromotedInteger and uses them to promote many arithmetic operations. VPSExtPromotedInteger uses a shift pair because we don't have VP_SIGN_EXTEND_INREG yet. --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 113 ++++++++++++------ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 21 ++++ llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 12 +- llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll | 6 +- .../RISCV/rvv/fixed-vectors-vdiv-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vdivu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vmax-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vmin-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vminu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vrem-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vremu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vshl-vp.ll | 3 +- .../RISCV/rvv/fixed-vectors-vsra-vp.ll | 7 +- .../RISCV/rvv/fixed-vectors-vsrl-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll | 40 +++---- llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll | 7 +- llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll | 5 +- 27 files changed, 201 insertions(+), 136 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 8fda35f0086329..12f1d005249d60 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -646,18 +646,21 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { } } - // Zero extend to the promoted type and do the count 
there. - SDValue Op = ZExtPromotedInteger(N->getOperand(0)); - // Subtract off the extra leading bits in the bigger type. SDValue ExtractLeadingBits = DAG.getConstant( NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT); - if (!N->isVPOpcode()) + if (!N->isVPOpcode()) { + // Zero extend to the promoted type and do the count there. + SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::SUB, dl, NVT, DAG.getNode(N->getOpcode(), dl, NVT, Op), ExtractLeadingBits); + } + SDValue Mask = N->getOperand(1); SDValue EVL = N->getOperand(2); + // Zero extend to the promoted type and do the count there. + SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); return DAG.getNode(ISD::VP_SUB, dl, NVT, DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL), ExtractLeadingBits, Mask, EVL); @@ -681,11 +684,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) { } // Zero extend to the promoted type and do the count or parity there. - SDValue Op = ZExtPromotedInteger(N->getOperand(0)); - if (!N->isVPOpcode()) + if (!N->isVPOpcode()) { + SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op); - return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, - N->getOperand(1), N->getOperand(2)); + } + + SDValue Mask = N->getOperand(1); + SDValue EVL = N->getOperand(2); + SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, Mask, + EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { @@ -1335,12 +1343,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FFREXP(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); - if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) - RHS = ZExtPromotedInteger(RHS); - if (N->getOpcode() != ISD::VP_SHL) + if 
(N->getOpcode() != ISD::VP_SHL) { + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = ZExtPromotedInteger(RHS); + return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } + + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = VPZExtPromotedInteger(RHS, Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) { @@ -1364,27 +1379,39 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) { - // Sign extend the input. - SDValue LHS = SExtPromotedInteger(N->getOperand(0)); - SDValue RHS = SExtPromotedInteger(N->getOperand(1)); - if (N->getNumOperands() == 2) + if (N->getNumOperands() == 2) { + // Sign extend the input. + SDValue LHS = SExtPromotedInteger(N->getOperand(0)); + SDValue RHS = SExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); assert(N->isVPOpcode() && "Expected VP opcode"); + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + // Sign extend the input. + SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL); + SDValue RHS = VPSExtPromotedInteger(N->getOperand(1), Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) { - // Zero extend the input. - SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); - SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); - if (N->getNumOperands() == 2) + if (N->getNumOperands() == 2) { + // Zero extend the input. 
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); + SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); assert(N->isVPOpcode() && "Expected VP opcode"); + // Zero extend the input. + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); + SDValue RHS = VPZExtPromotedInteger(N->getOperand(1), Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) { @@ -1400,27 +1427,43 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) { - // The input value must be properly sign extended. - SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); - if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) - RHS = ZExtPromotedInteger(RHS); - if (N->getOpcode() != ISD::VP_SRA) + if (N->getOpcode() != ISD::VP_SRA) { + // The input value must be properly sign extended. + SDValue LHS = SExtPromotedInteger(N->getOperand(0)); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = ZExtPromotedInteger(RHS); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } + + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + // The input value must be properly sign extended. 
+ SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = VPZExtPromotedInteger(RHS, Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) { - // The input value must be properly zero extended. - SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); - if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) - RHS = ZExtPromotedInteger(RHS); - if (N->getOpcode() != ISD::VP_SRL) + if (N->getOpcode() != ISD::VP_SRL) { + // The input value must be properly zero extended. + SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = ZExtPromotedInteger(RHS); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } + + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + // The input value must be properly zero extended. 
+ SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = VPZExtPromotedInteger(RHS, Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) { @@ -1487,7 +1530,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VPFunnelShift(SDNode *N) { SDValue Mask = N->getOperand(3); SDValue EVL = N->getOperand(4); if (getTypeAction(Amt.getValueType()) == TargetLowering::TypePromoteInteger) - Amt = ZExtPromotedInteger(Amt); + Amt = VPZExtPromotedInteger(Amt, Mask, EVL); EVT AmtVT = Amt.getValueType(); SDLoc DL(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d925089d5689f1..ba3c7582d5a8a2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -275,6 +275,27 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { return DAG.getZeroExtendInReg(Op, dl, OldVT); } + /// Get a promoted operand and zero extend it to the final size. + SDValue VPSExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) { + EVT OldVT = Op.getValueType(); + SDLoc dl(Op); + Op = GetPromotedInteger(Op); + // FIXME: Add VP_SIGN_EXTEND_INREG. + EVT VT = Op.getValueType(); + unsigned BitsDiff = VT.getScalarSizeInBits() - OldVT.getScalarSizeInBits(); + SDValue ShiftCst = DAG.getShiftAmountConstant(BitsDiff, VT, dl); + SDValue Shl = DAG.getNode(ISD::VP_SHL, dl, VT, Op, ShiftCst, Mask, EVL); + return DAG.getNode(ISD::VP_SRA, dl, VT, Shl, ShiftCst, Mask, EVL); + } + + /// Get a promoted operand and zero extend it to the final size. 
+ SDValue VPZExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) { + EVT OldVT = Op.getValueType(); + SDLoc dl(Op); + Op = GetPromotedInteger(Op); + return DAG.getVPZeroExtendInReg(Op, Mask, EVL, dl, OldVT); + } + // Promote the given operand V (vector or scalar) according to N's specific // reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. Returns // the nominal extension opcode (ISD::(ANY|ZERO|SIGN)_EXTEND) and the diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll index fff280c005b542..df413b878172bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll @@ -2574,9 +2574,8 @@ define @vp_ctlz_nxv1i9( %va, @vp_ctlz_nxv1i9( %va, @vp_ctlz_zero_undef_nxv1i9( %va, @vp_ctlz_zero_undef_nxv1i9( %va, @vp_ctpop_nxv1i9( %va, @vp_ctpop_nxv1i9( %va, @llvm.vp.ctpop.nxv1i9( %va, %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll index 29f8eaba900527..e3c7d02462cc7f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vdiv_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vdiv_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vdiv.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll index 3f8eb0ff276b7f..03bd85bf5e69e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vdivu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vdivu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vdivu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.udiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll index 9789afda9344ad..0b0d758ad8ded8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smax.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vmax_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmax_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.smax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll index 36b0a4642b6169..98e630a0e59e5a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vmaxu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vmaxu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vmaxu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.umax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll index adb0a30f34d35a..a6e3764b37550d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smin.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vmin_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmin_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmin.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.smin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll index 671ce82d4ae795..c59b65edd1ec10 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vminu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vminu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vminu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.umin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll index 4bbbad5ed0e0e8..ff8a63e371c8ef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.srem.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vrem_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vrem_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vrem.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.srem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll index ee11307bddc88c..b5eec4142c7824 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vremu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vremu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vremu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.urem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll index c4b7c1f2f19f0f..16a0fddfa98277 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll @@ -10,9 +10,8 @@ define <8 x i7> @vsll_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex ; CHECK-LABEL: vsll_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.shl.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll index 7ea5b1f0b505a3..180fafa9659b1c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll @@ -10,11 +10,10 @@ define <8 x i7> @vsra_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex ; CHECK-LABEL: vsra_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; 
CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsra.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.ashr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll index 9f9d4af0cc2f3f..22f04803eadd74 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vsrl_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex ; CHECK-LABEL: vsrl_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.lshr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index bc5617957d7d08..2c5a3dfffc2cfc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -1282,18 +1282,17 @@ define @fshr_v1i9( %a, %b, ; CHECK-LABEL: fshr_v1i9: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 511 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t +; CHECK-NEXT: vand.vx v10, v10, a1, v0.t ; CHECK-NEXT: li a0, 9 ; CHECK-NEXT: vremu.vx v10, v10, a0, v0.t ; CHECK-NEXT: vadd.vi v10, v10, 7, v0.t ; CHECK-NEXT: vand.vi v11, v10, 15, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t ; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, 
v0.t ; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vi v10, v10, 15, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -1306,18 +1305,17 @@ define @fshl_v1i9( %a, %b, ; CHECK-LABEL: fshl_v1i9: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 511 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t -; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t +; CHECK-NEXT: vand.vx v10, v10, a1, v0.t ; CHECK-NEXT: li a0, 9 ; CHECK-NEXT: vremu.vx v10, v10, a0, v0.t -; CHECK-NEXT: vnot.v v11, v10, v0.t -; CHECK-NEXT: vand.vi v11, v11, 15, v0.t -; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t +; CHECK-NEXT: vand.vi v11, v10, 15, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v11, v0.t +; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vi v10, v10, 15, v0.t -; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vv v9, v9, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %res = call @llvm.vp.fshl.nxv1i9( %a, %b, %c, %m, i32 %evl) @@ -1330,15 +1328,14 @@ declare @llvm.vp.fshr.nxv1i4(, @fshr_v1i4( %a, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: fshr_v1i4: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vand.vi v10, v10, 15 -; CHECK-NEXT: li a1, 4 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vremu.vx v10, v10, a1, v0.t +; CHECK-NEXT: vand.vi v10, v10, 15, v0.t ; CHECK-NEXT: vand.vi v9, v9, 15, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vv v8, v8, v10, v0.t +; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vremu.vx v9, v10, a0, v0.t +; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: ret %trunca = call @llvm.vp.trunc.nxv1i4.nxv1i8( %a, %m, i32 zeroext %evl) @@ 
-1353,15 +1350,14 @@ declare @llvm.vp.fshl.nxv1i4(, @fshl_v1i4( %a, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: fshl_v1i4: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vand.vi v10, v10, 15 -; CHECK-NEXT: li a1, 4 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vremu.vx v10, v10, a1, v0.t +; CHECK-NEXT: vand.vi v10, v10, 15, v0.t ; CHECK-NEXT: vand.vi v9, v9, 15, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t +; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vremu.vx v9, v10, a0, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll index 26089706cf99ef..a4b7ca7f39768f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.sdiv.nxv8i7(, @vdiv_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vdiv_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vdiv.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll index f41b885a66eaae..67c3f9dbf2869a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll @@ -10,11 +10,12 @@ define 
@vdivu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vdivu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vdivu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll index 8a76467986620c..c15caa31bb0986 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.smax.nxv8i7(, @vmax_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vmax_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll index 1c74887c1b20fb..df494f8af7387c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll @@ -10,11 +10,12 @@ define @vmaxu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vmaxu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: 
vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vmaxu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll index 1c71242c3c7d79..794a21c7c6abac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.smin.nxv8i7(, @vmin_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vmin_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vmin.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll index 6d89a9777cf917..d54de281a7fd28 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll @@ -10,11 +10,12 @@ define @vminu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vminu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; 
CHECK-NEXT: vminu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll index cf85fd827b51f1..2ef96f4b3896fc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.srem.nxv8i7(, @vrem_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vrem_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vrem.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll index 61bdd5b8d3c8a7..1f1ed4a1269acb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll @@ -10,11 +10,12 @@ define @vremu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vremu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vremu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll index 
c04d5ea2da3c1b..380835494ed17d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll @@ -12,8 +12,8 @@ define @vsll_vx_nxv8i7( %a, i7 signext %b, poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll index 632c4db5c5bb57..cff8cc710d21f3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll @@ -9,13 +9,14 @@ declare @llvm.vp.ashr.nxv8i7(, @vsra_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vsra_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vsra.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll index ec5b7f3faf7ca8..ff6771b643031f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll @@ -10,11 +10,12 @@ define @vsrl_vx_nxv8i7( %a, i7 signext %b, poison, i7 %b, i32 0 From 0e96eebc7f681a7ce41f35909e609c7c61a11455 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Tue, 28 May 2024 12:52:45 -0700 Subject: [PATCH 56/89] [msan] Reland: Increase k num stack origin descrs (limited to non-PowerPC) (#93117) The original pull request (https://github.com/llvm/llvm-project/pull/92838) was reverted due to a PowerPC buildbot breakage (https://github.com/llvm/llvm-project/commit/df626dd11c360c58eddae813ce6a0524d0a53696). This reland limits the scope of the change to non-PowerPC platforms. 
I am unaware of any PowerPC use cases that would benefit from a larger kNumStackOriginDescrs constant. Original CL description: This increases the constant size of kNumStackOriginDescrs to 4M (64GB of BSS across two arrays), which ought to be enough for anybody. This is the easier alternative suggested by eugenis@ in https://github.com/llvm/llvm-project/pull/92826. --- compiler-rt/lib/msan/msan.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index a2fc27de1901b4..9375e27d4f4d24 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -100,7 +100,17 @@ int msan_report_count = 0; // Array of stack origins. // FIXME: make it resizable. -static const uptr kNumStackOriginDescrs = 1024 * 1024; +// Although BSS memory doesn't cost anything until used, it is limited to 2GB +// in some configurations (e.g., "relocation R_X86_64_PC32 out of range: +// ... is not in [-2147483648, 2147483647]; references section '.bss'"). +// We use kNumStackOriginDescrs * (sizeof(char*) + sizeof(uptr)) == 64MB. +#ifdef SANITIZER_PPC +// soft_rss_limit test (release_origin.c) fails on PPC if kNumStackOriginDescrs +// is too high +static const uptr kNumStackOriginDescrs = 1 * 1024 * 1024; +#else +static const uptr kNumStackOriginDescrs = 4 * 1024 * 1024; +#endif // SANITIZER_PPC static const char *StackOriginDescr[kNumStackOriginDescrs]; static uptr StackOriginPC[kNumStackOriginDescrs]; static atomic_uint32_t NumStackOriginDescrs; From d9dec109375ded13d61da20877c399fb8fbb877d Mon Sep 17 00:00:00 2001 From: Lucile Rose Nihlen Date: Tue, 28 May 2024 19:53:21 +0000 Subject: [PATCH 57/89] [ci] limit parallel windows compile jobs to 24 (#93329) This is an experiment to see if we can prevent some of the compiler OOMs happening without unduly impacting the Windows build latency. 
--- .ci/monolithic-windows.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 4fd88ea81c84a8..91e719c52d4363 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -44,6 +44,8 @@ pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt # see https://github.com/llvm/llvm-project/pull/82393 and # https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40 # for further information. +# We limit the number of parallel compile jobs to 24 to control memory +# consumption and improve build reliability. cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_PROJECTS="${projects}" \ -G Ninja \ @@ -58,7 +60,9 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D MLIR_ENABLE_BINDINGS_PYTHON=ON \ -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \ -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \ - -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" + -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \ + -D LLVM_PARALLEL_COMPILE_JOBS=16 \ + -D LLVM_PARALLEL_LINK_JOBS=4 echo "--- ninja" # Targets are not escaped as they are passed as separate arguments. 
From c96860aea2c77392bad16f1c4f55014164669de3 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Tue, 28 May 2024 22:09:34 +0200 Subject: [PATCH 58/89] [clang-tidy] Optimize realpath in readability-identifier-naming (#92659) - Reduce disk IO usage by adding cache to an realpath introduced by #81985 --- .../clang-tidy/readability/IdentifierNamingCheck.cpp | 12 ++++++++++-- .../clang-tidy/readability/IdentifierNamingCheck.h | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index c3208392df1566..828f13805a6980 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -1414,13 +1414,21 @@ IdentifierNamingCheck::getDiagInfo(const NamingCheckId &ID, }}; } +StringRef IdentifierNamingCheck::getRealFileName(StringRef FileName) const { + auto Iter = RealFileNameCache.try_emplace(FileName); + SmallString<256U> &RealFileName = Iter.first->getValue(); + if (!Iter.second) + return RealFileName; + llvm::sys::fs::real_path(FileName, RealFileName); + return RealFileName; +} + const IdentifierNamingCheck::FileStyle & IdentifierNamingCheck::getStyleForFile(StringRef FileName) const { if (!GetConfigPerFile) return *MainFileStyle; - SmallString<128> RealFileName; - llvm::sys::fs::real_path(FileName, RealFileName); + StringRef RealFileName = getRealFileName(FileName); StringRef Parent = llvm::sys::path::parent_path(RealFileName); auto Iter = NamingStylesCache.find(Parent); if (Iter != NamingStylesCache.end()) diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h index 27c8e4bc768c40..646ec0eac8dd1c 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h @@ 
-205,6 +205,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { const NamingCheckFailure &Failure) const override; const FileStyle &getStyleForFile(StringRef FileName) const; + StringRef getRealFileName(StringRef FileName) const; /// Find the style kind of a field in an anonymous record. StyleKind findStyleKindForAnonField( @@ -222,6 +223,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { /// Stores the style options as a vector, indexed by the specified \ref /// StyleKind, for a given directory. mutable llvm::StringMap NamingStylesCache; + mutable llvm::StringMap> RealFileNameCache; FileStyle *MainFileStyle; ClangTidyContext *Context; const bool GetConfigPerFile; From 0aacef3abc41cfc8efb5f1b9483bc37599352a59 Mon Sep 17 00:00:00 2001 From: Mattan Elkaim <73639004+mattanelkaim@users.noreply.github.com> Date: Tue, 28 May 2024 23:19:01 +0300 Subject: [PATCH 59/89] [clang-tidy][NFC] Update identifier-length.rst (#93467) Swapped code blocks of parameter and variable, which have been confused (in a clang-tidy doc file) --- .../checks/readability/identifier-length.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst index 44d97f7b363bff..271970c292c8fa 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst @@ -28,10 +28,7 @@ The following options are described below: .. code-block:: c++ - int doubler(int x) // warns that x is too short - { - return 2 * x; - } + int i = 42; // warns that 'i' is too short This check does not have any fix suggestions in the general case since variable names have semantic value. @@ -50,7 +47,10 @@ The following options are described below: .. 
code-block:: c++ - int i = 42; // warns that 'i' is too short + int doubler(int x) // warns that x is too short + { + return 2 * x; + } This check does not have any fix suggestions in the general case since variable names have semantic value. From c108c1e94580d70e2be66172ab4397fcff004376 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 13:26:36 -0700 Subject: [PATCH 60/89] [WebAssembly] Rename old EH tests to *-legacy (#93585) I think test files for the legacy and the new EH (exnref) are better be separate, and I'd like to use the current test file names for the new EH, rather than keeping the current files and naming the new ones as `-new` or something. --- .../WebAssembly/{cfg-stackify-eh.ll => cfg-stackify-eh-legacy.ll} | 0 .../CodeGen/WebAssembly/{exception.ll => exception-legacy.ll} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/WebAssembly/{cfg-stackify-eh.ll => cfg-stackify-eh-legacy.ll} (100%) rename llvm/test/CodeGen/WebAssembly/{exception.ll => exception-legacy.ll} (100%) diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll similarity index 100% rename from llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll rename to llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll similarity index 100% rename from llvm/test/CodeGen/WebAssembly/exception.ll rename to llvm/test/CodeGen/WebAssembly/exception-legacy.ll From 9e89d107a6ec2ade15eddb549fa473cf09bf230e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 13:30:00 -0700 Subject: [PATCH 61/89] [memprof] Add MemProf format Version 3 (#93608) This patch adds Version 3 for development purposes. For now, this patch adds V3 as a copy of V2. For the most part, this patch adds "case Version3:" wherever "case Version2:" appears. 
One exception is writeMemProfV3, which is copied from writeMemProfV2 but updated to write out memprof::Version3 to the MemProf header. We'll incrementally modify writeMemProfV3 in subsequent patches. --- llvm/include/llvm/ProfileData/MemProf.h | 4 +- llvm/lib/ProfileData/InstrProfReader.cpp | 4 +- llvm/lib/ProfileData/InstrProfWriter.cpp | 52 +++++++++++++++++++ llvm/lib/ProfileData/MemProf.cpp | 4 ++ .../llvm-profdata/memprof-merge-versions.test | 6 +++ llvm/tools/llvm-profdata/llvm-profdata.cpp | 3 +- 6 files changed, 70 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 17cef15344285b..d44a2d1e2fb117 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -28,10 +28,12 @@ enum IndexedVersion : uint64_t { Version1 = 1, // Version 2: Added a call stack table. Version2 = 2, + // Version 3: Under development. + Version3 = 3, }; constexpr uint64_t MinimumSupportedVersion = Version0; -constexpr uint64_t MaximumSupportedVersion = Version2; +constexpr uint64_t MaximumSupportedVersion = Version3; // Verify that the minimum and maximum satisfy the obvious constraint. static_assert(MinimumSupportedVersion <= MaximumSupportedVersion); diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 836206a4fd86e2..798236c295194a 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1212,7 +1212,8 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start, const uint64_t FirstWord = support::endian::readNext(Ptr); - if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2) { + if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2 || + FirstWord == memprof::Version3) { // Everything is good. We can proceed to deserialize the rest. 
Version = static_cast(FirstWord); } else if (FirstWord >= 24) { @@ -1559,6 +1560,7 @@ IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const { "MemProfCallStackTable must not be available"); return getMemProfRecordV0(IndexedRecord, *MemProfFrameTable); case memprof::Version2: + case memprof::Version3: assert(MemProfFrameTable && "MemProfFrameTable must be available"); assert(MemProfCallStackTable && "MemProfCallStackTable must be available"); return getMemProfRecordV2(IndexedRecord, *MemProfFrameTable, diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index b67a9700b680ab..b16714ae8b9a2d 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -617,6 +617,56 @@ static Error writeMemProfV2(ProfOStream &OS, return Error::success(); } +// Write out MemProf Version3 as follows: +// uint64_t Version +// uint64_t RecordTableOffset = RecordTableGenerator.Emit +// uint64_t FramePayloadOffset = Offset for the frame payload +// uint64_t FrameTableOffset = FrameTableGenerator.Emit +// uint64_t CallStackPayloadOffset = Offset for the call stack payload +// uint64_t CallStackTableOffset = CallStackTableGenerator.Emit +// uint64_t Num schema entries +// uint64_t Schema entry 0 +// uint64_t Schema entry 1 +// .... +// uint64_t Schema entry N - 1 +// OnDiskChainedHashTable MemProfRecordData +// OnDiskChainedHashTable MemProfFrameData +// OnDiskChainedHashTable MemProfCallStackData +static Error writeMemProfV3(ProfOStream &OS, + memprof::IndexedMemProfData &MemProfData, + bool MemProfFullSchema) { + OS.write(memprof::Version3); + uint64_t HeaderUpdatePos = OS.tell(); + OS.write(0ULL); // Reserve space for the memprof record table offset. + OS.write(0ULL); // Reserve space for the memprof frame payload offset. + OS.write(0ULL); // Reserve space for the memprof frame table offset. + OS.write(0ULL); // Reserve space for the memprof call stack payload offset. 
+ OS.write(0ULL); // Reserve space for the memprof call stack table offset. + + auto Schema = memprof::getHotColdSchema(); + if (MemProfFullSchema) + Schema = memprof::getFullSchema(); + writeMemProfSchema(OS, Schema); + + uint64_t RecordTableOffset = writeMemProfRecords(OS, MemProfData.RecordData, + &Schema, memprof::Version3); + + uint64_t FramePayloadOffset = OS.tell(); + uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfData.FrameData); + + uint64_t CallStackPayloadOffset = OS.tell(); + uint64_t CallStackTableOffset = + writeMemProfCallStacks(OS, MemProfData.CallStackData); + + uint64_t Header[] = { + RecordTableOffset, FramePayloadOffset, FrameTableOffset, + CallStackPayloadOffset, CallStackTableOffset, + }; + OS.patch({{HeaderUpdatePos, Header, std::size(Header)}}); + + return Error::success(); +} + // Write out the MemProf data in a requested version. static Error writeMemProf(ProfOStream &OS, memprof::IndexedMemProfData &MemProfData, @@ -629,6 +679,8 @@ static Error writeMemProf(ProfOStream &OS, return writeMemProfV1(OS, MemProfData); case memprof::Version2: return writeMemProfV2(OS, MemProfData, MemProfFullSchema); + case memprof::Version3: + return writeMemProfV3(OS, MemProfData, MemProfFullSchema); } return make_error( diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 89afe7c39027c6..2f0e53736c82e5 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -52,6 +52,7 @@ size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema, case Version1: return serializedSizeV0(*this, Schema); case Version2: + case Version3: return serializedSizeV2(*this, Schema); } llvm_unreachable("unsupported MemProf version"); @@ -95,6 +96,7 @@ size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema, case Version1: return serializedSizeV0(*this, Schema); case Version2: + case Version3: return serializedSizeV2(*this, Schema); } llvm_unreachable("unsupported MemProf version"); 
@@ -149,6 +151,7 @@ void IndexedMemProfRecord::serialize(const MemProfSchema &Schema, serializeV0(*this, Schema, OS); return; case Version2: + case Version3: serializeV2(*this, Schema, OS); return; } @@ -239,6 +242,7 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, case Version1: return deserializeV0(Schema, Ptr); case Version2: + case Version3: return deserializeV2(Schema, Ptr); } llvm_unreachable("unsupported MemProf version"); diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-versions.test b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test index 28f65e0781bc63..aa7d0329425dc5 100644 --- a/llvm/test/tools/llvm-profdata/memprof-merge-versions.test +++ b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test @@ -19,6 +19,12 @@ RUN: llvm-profdata show %t.prof.v2 | FileCheck %s RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=2 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v2 RUN: llvm-profdata show %t.prof.v2 | FileCheck %s +RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3 +RUN: llvm-profdata show %t.prof.v3 | FileCheck %s + +RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3 +RUN: llvm-profdata show %t.prof.v3 | FileCheck %s + For now we only check the validity of the instrumented profile since we don't have a way to display the contents of the memprof indexed format yet. 
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 28c3afa1016473..fae6d1e989ab5a 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -306,7 +306,8 @@ cl::opt MemProfVersionRequested( cl::init(memprof::Version0), cl::values(clEnumValN(memprof::Version0, "0", "version 0"), clEnumValN(memprof::Version1, "1", "version 1"), - clEnumValN(memprof::Version2, "2", "version 2"))); + clEnumValN(memprof::Version2, "2", "version 2"), + clEnumValN(memprof::Version3, "3", "version 3"))); cl::opt MemProfFullSchema( "memprof-full-schema", cl::Hidden, cl::sub(MergeSubcommand), From 193e9007ef0bef6c881ab26746221f22ec674447 Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 28 May 2024 13:18:46 -0700 Subject: [PATCH 62/89] [OpenACC][NFC] Fix begin loc and split it from the directive location I discovered while working on something else that we were using the location of the directive name as the 'beginloc' which caused some problems in a few places. This patch makes it so our beginloc is the '#' as we originally designed, and then adds a DirectiveLoc concept to a construct for use diagnosing the name. 
--- clang/include/clang/AST/StmtOpenACC.h | 32 ++++++++++++++--------- clang/include/clang/Parse/Parser.h | 1 + clang/include/clang/Sema/SemaOpenACC.h | 3 ++- clang/lib/AST/StmtOpenACC.cpp | 13 +++++---- clang/lib/Parse/ParseOpenACC.cpp | 19 +++++++------- clang/lib/Sema/SemaOpenACC.cpp | 7 ++--- clang/lib/Sema/TreeTransform.h | 9 ++++--- clang/lib/Serialization/ASTReaderStmt.cpp | 1 + clang/lib/Serialization/ASTWriterStmt.cpp | 1 + 9 files changed, 49 insertions(+), 37 deletions(-) diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index b706864798baaf..04daf511f58713 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -31,6 +31,8 @@ class OpenACCConstructStmt : public Stmt { /// The location of the directive statement, from the '#' to the last token of /// the directive. SourceRange Range; + /// The location of the directive name. + SourceLocation DirectiveLoc; /// The list of clauses. This is stored here as an ArrayRef, as this is the /// most convienient place to access the list, however the list itself should @@ -39,8 +41,9 @@ class OpenACCConstructStmt : public Stmt { protected: OpenACCConstructStmt(StmtClass SC, OpenACCDirectiveKind K, - SourceLocation Start, SourceLocation End) - : Stmt(SC), Kind(K), Range(Start, End) {} + SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End) + : Stmt(SC), Kind(K), Range(Start, End), DirectiveLoc(DirectiveLoc) {} // Used only for initialization, the leaf class can initialize this to // trailing storage. 
@@ -59,6 +62,7 @@ class OpenACCConstructStmt : public Stmt { SourceLocation getBeginLoc() const { return Range.getBegin(); } SourceLocation getEndLoc() const { return Range.getEnd(); } + SourceLocation getDirectiveLoc() const { return DirectiveLoc; } ArrayRef clauses() const { return Clauses; } child_range children() { @@ -81,9 +85,11 @@ class OpenACCAssociatedStmtConstruct : public OpenACCConstructStmt { protected: OpenACCAssociatedStmtConstruct(StmtClass SC, OpenACCDirectiveKind K, - SourceLocation Start, SourceLocation End, - Stmt *AssocStmt) - : OpenACCConstructStmt(SC, K, Start, End), AssociatedStmt(AssocStmt) {} + SourceLocation Start, + SourceLocation DirectiveLoc, + SourceLocation End, Stmt *AssocStmt) + : OpenACCConstructStmt(SC, K, Start, DirectiveLoc, End), + AssociatedStmt(AssocStmt) {} void setAssociatedStmt(Stmt *S) { AssociatedStmt = S; } Stmt *getAssociatedStmt() { return AssociatedStmt; } @@ -126,10 +132,10 @@ class OpenACCComputeConstruct final friend class ASTStmtReader; friend class ASTContext; OpenACCComputeConstruct(unsigned NumClauses) - : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, - OpenACCDirectiveKind::Invalid, - SourceLocation{}, SourceLocation{}, - /*AssociatedStmt=*/nullptr) { + : OpenACCAssociatedStmtConstruct( + OpenACCComputeConstructClass, OpenACCDirectiveKind::Invalid, + SourceLocation{}, SourceLocation{}, SourceLocation{}, + /*AssociatedStmt=*/nullptr) { // We cannot send the TrailingObjects storage to the base class (which holds // a reference to the data) until it is constructed, so we have to set it // separately here. 
@@ -141,11 +147,11 @@ class OpenACCComputeConstruct final } OpenACCComputeConstruct(OpenACCDirectiveKind K, SourceLocation Start, - SourceLocation End, + SourceLocation DirectiveLoc, SourceLocation End, ArrayRef Clauses, Stmt *StructuredBlock) : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, K, Start, - End, StructuredBlock) { + DirectiveLoc, End, StructuredBlock) { assert(isOpenACCComputeDirectiveKind(K) && "Only parallel, serial, and kernels constructs should be " "represented by this type"); @@ -169,8 +175,8 @@ class OpenACCComputeConstruct final unsigned NumClauses); static OpenACCComputeConstruct * Create(const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc, - SourceLocation EndLoc, ArrayRef Clauses, - Stmt *StructuredBlock); + SourceLocation DirectiveLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *StructuredBlock); Stmt *getStructuredBlock() { return getAssociatedStmt(); } const Stmt *getStructuredBlock() const { diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 00b475e5b42824..d054b8cf0d2405 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3659,6 +3659,7 @@ class Parser : public CodeCompletionHandler { struct OpenACCDirectiveParseInfo { OpenACCDirectiveKind DirKind; SourceLocation StartLoc; + SourceLocation DirLoc; SourceLocation EndLoc; SmallVector Clauses; // TODO OpenACC: As we implement support for the Atomic, Routine, Cache, and diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 6f69fa08939b82..66144de4340a8a 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -379,7 +379,7 @@ class SemaOpenACC : public SemaBase { /// Called after the construct has been parsed, but clauses haven't been /// parsed. This allows us to diagnose not-implemented, as well as set up any /// state required for parsing the clauses. 
- void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation StartLoc); + void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation DirLoc); /// Called after the directive, including its clauses, have been parsed and /// parsing has consumed the 'annot_pragma_openacc_end' token. This DOES @@ -400,6 +400,7 @@ class SemaOpenACC : public SemaBase { /// declaration group or associated statement. StmtResult ActOnEndStmtDirective(OpenACCDirectiveKind K, SourceLocation StartLoc, + SourceLocation DirLoc, SourceLocation EndLoc, ArrayRef Clauses, StmtResult AssocStmt); diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp index a381a8dd7b62c3..47899b344c97ab 100644 --- a/clang/lib/AST/StmtOpenACC.cpp +++ b/clang/lib/AST/StmtOpenACC.cpp @@ -23,15 +23,14 @@ OpenACCComputeConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) { return Inst; } -OpenACCComputeConstruct * -OpenACCComputeConstruct::Create(const ASTContext &C, OpenACCDirectiveKind K, - SourceLocation BeginLoc, SourceLocation EndLoc, - ArrayRef Clauses, - Stmt *StructuredBlock) { +OpenACCComputeConstruct *OpenACCComputeConstruct::Create( + const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc, + SourceLocation DirLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *StructuredBlock) { void *Mem = C.Allocate( OpenACCComputeConstruct::totalSizeToAlloc( Clauses.size())); - auto *Inst = new (Mem) - OpenACCComputeConstruct(K, BeginLoc, EndLoc, Clauses, StructuredBlock); + auto *Inst = new (Mem) OpenACCComputeConstruct(K, BeginLoc, DirLoc, EndLoc, + Clauses, StructuredBlock); return Inst; } diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index e9c60f76165b68..63afc18783a1f7 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -1347,11 +1347,13 @@ void Parser::ParseOpenACCCacheVarList() { ParseOpenACCVarList(OpenACCClauseKind::Invalid); } -Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() { - 
SourceLocation StartLoc = getCurToken().getLocation(); +Parser::OpenACCDirectiveParseInfo +Parser::ParseOpenACCDirective() { + SourceLocation StartLoc = ConsumeAnnotationToken(); + SourceLocation DirLoc = getCurToken().getLocation(); OpenACCDirectiveKind DirKind = ParseOpenACCDirectiveKind(*this); - getActions().OpenACC().ActOnConstruct(DirKind, StartLoc); + getActions().OpenACC().ActOnConstruct(DirKind, DirLoc); // Once we've parsed the construct/directive name, some have additional // specifiers that need to be taken care of. Atomic has an 'atomic-clause' @@ -1390,7 +1392,7 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() { break; case OpenACCDirectiveKind::Wait: // OpenACC has an optional paren-wrapped 'wait-argument'. - if (ParseOpenACCWaitArgument(StartLoc, /*IsDirective=*/true).Failed) + if (ParseOpenACCWaitArgument(DirLoc, /*IsDirective=*/true).Failed) T.skipToEnd(); else T.consumeClose(); @@ -1404,7 +1406,8 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() { } // Parses the list of clauses, if present, plus set up return value. 
- OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, SourceLocation{}, + OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, DirLoc, + SourceLocation{}, ParseOpenACCClauseList(DirKind)}; assert(Tok.is(tok::annot_pragma_openacc_end) && @@ -1421,7 +1424,6 @@ Parser::DeclGroupPtrTy Parser::ParseOpenACCDirectiveDecl() { assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token"); ParsingOpenACCDirectiveRAII DirScope(*this); - ConsumeAnnotationToken(); OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective(); @@ -1438,7 +1440,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token"); ParsingOpenACCDirectiveRAII DirScope(*this); - ConsumeAnnotationToken(); OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective(); if (getActions().OpenACC().ActOnStartStmtDirective(DirInfo.DirKind, @@ -1456,6 +1457,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { } return getActions().OpenACC().ActOnEndStmtDirective( - DirInfo.DirKind, DirInfo.StartLoc, DirInfo.EndLoc, DirInfo.Clauses, - AssocStmt); + DirInfo.DirKind, DirInfo.StartLoc, DirInfo.DirLoc, DirInfo.EndLoc, + DirInfo.Clauses, AssocStmt); } diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 09d91b31cfe5f9..15239f4f35c39f 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -844,7 +844,7 @@ ExprResult SemaOpenACC::CheckReductionVar(Expr *VarExpr) { } void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, - SourceLocation StartLoc) { + SourceLocation DirLoc) { switch (K) { case OpenACCDirectiveKind::Invalid: // Nothing to do here, an invalid kind has nothing we can check here. We @@ -859,7 +859,7 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, // here as these constructs do not take any arguments. 
break; default: - Diag(StartLoc, diag::warn_acc_construct_unimplemented) << K; + Diag(DirLoc, diag::warn_acc_construct_unimplemented) << K; break; } } @@ -1265,6 +1265,7 @@ bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K, StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, SourceLocation StartLoc, + SourceLocation DirLoc, SourceLocation EndLoc, ArrayRef Clauses, StmtResult AssocStmt) { @@ -1278,7 +1279,7 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, case OpenACCDirectiveKind::Kernels: // TODO OpenACC: Add clauses to the construct here. return OpenACCComputeConstruct::Create( - getASTContext(), K, StartLoc, EndLoc, Clauses, + getASTContext(), K, StartLoc, DirLoc, EndLoc, Clauses, AssocStmt.isUsable() ? AssocStmt.get() : nullptr); } llvm_unreachable("Unhandled case in directive handling?"); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index dee335b526991b..765e6177d202d1 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4033,11 +4033,12 @@ class TreeTransform { StmtResult RebuildOpenACCComputeConstruct(OpenACCDirectiveKind K, SourceLocation BeginLoc, + SourceLocation DirLoc, SourceLocation EndLoc, ArrayRef Clauses, StmtResult StrBlock) { - return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, EndLoc, - Clauses, StrBlock); + return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, DirLoc, + EndLoc, Clauses, StrBlock); } private: @@ -11559,8 +11560,8 @@ StmtResult TreeTransform::TransformOpenACCComputeConstruct( getSema().OpenACC().ActOnAssociatedStmt(C->getDirectiveKind(), StrBlock); return getDerived().RebuildOpenACCComputeConstruct( - C->getDirectiveKind(), C->getBeginLoc(), C->getEndLoc(), - TransformedClauses, StrBlock); + C->getDirectiveKind(), C->getBeginLoc(), C->getDirectiveLoc(), + C->getEndLoc(), TransformedClauses, StrBlock); } //===----------------------------------------------------------------------===// diff --git 
a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index eac4faff285490..bea2b949891070 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2797,6 +2797,7 @@ void ASTStmtReader::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) { (void)Record.readInt(); S->Kind = Record.readEnum(); S->Range = Record.readSourceRange(); + S->DirectiveLoc = Record.readSourceLocation(); Record.readOpenACCClauseList(S->Clauses); } diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index a44852af97bea3..3c586b270fbf4f 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2847,6 +2847,7 @@ void ASTStmtWriter::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) { Record.push_back(S->clauses().size()); Record.writeEnum(S->Kind); Record.AddSourceRange(S->Range); + Record.AddSourceLocation(S->DirectiveLoc); Record.writeOpenACCClauseList(S->clauses()); } From 5a23d31c5033dcb41d374692ed26d87ed8e2665a Mon Sep 17 00:00:00 2001 From: William Junda Huang Date: Tue, 28 May 2024 16:41:53 -0400 Subject: [PATCH 63/89] [Sample Profile] Check hot callsite threshold when inlining a function with a sample profile (#93286) Currently if a callsite is hot as determined by the sample profile, it is unconditionally inlined barring invalid cases (such as recursion). Inline cost check should still apply because a function's hotness and its inline cost are two different things. For example if a function is calling another very large function multiple times (at different code paths), the large function should not be inlined even if its hot. 
--- llvm/lib/Transforms/IPO/SampleProfile.cpp | 7 ++- .../Inputs/inline-hot-callsite-threshold.prof | 3 + .../inline-hot-callsite-threshold.ll | 61 +++++++++++++++++++ .../SampleProfile/pseudo-probe-inline.ll | 2 +- llvm/test/Transforms/SampleProfile/remarks.ll | 4 +- 5 files changed, 71 insertions(+), 6 deletions(-) create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof create mode 100644 llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 0920179fb76b73..92ad4c34da6e7e 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1391,10 +1391,11 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { return InlineCost::getAlways("preinliner"); } - // For old FDO inliner, we inline the call site as long as cost is not - // "Never". The cost-benefit check is done earlier. + // For old FDO inliner, we inline the call site if it is below hot threshold, + // even if the function is hot based on sample profile data. This is to + // prevent huge functions from being inlined. 
if (!CallsitePrioritizedInline) { - return InlineCost::get(Cost.getCost(), INT_MAX); + return InlineCost::get(Cost.getCost(), SampleHotCallSiteThreshold); } // Otherwise only use the cost from call analyzer, but overwite threshold with diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof new file mode 100644 index 00000000000000..d1c0408210f498 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof @@ -0,0 +1,3 @@ +foo:100:100 + 1: bar:100 + 1:100 diff --git a/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll new file mode 100644 index 00000000000000..914ab4f1e3da58 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=100 2>&1 | FileCheck %s + +; CHECK: remark: a.cc:6:12: 'bar' inlined into 'foo' to match profiling context with (cost={{.*}}, threshold=100) +; CHECK: define dso_local noundef i32 @foo(i32 noundef %0) +; CHECK-NOT: %2 = tail call noundef i32 @bar(i32 noundef %0) +; CHECK-NEXT: %2 = icmp sgt i32 %0, 1 +; CHECK-NEXT: br i1 %2, label %3, label %bar.exit + +; Manually lower cost threshold for hot function inlining, so that the function +; is not inlined even profile indicates it as hot. 
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=1 2>&1 | FileCheck %s --check-prefix=COST + +; COST-NOT: remark +; COST: define dso_local noundef i32 @foo(i32 noundef %0) +; COST-NEXT: %2 = tail call noundef i32 @bar(i32 noundef %0) + +define dso_local noundef i32 @bar(i32 noundef %0) #0 !dbg !10 { + %2 = icmp sgt i32 %0, 1 + br i1 %2, label %3, label %15 +3: ; preds = %1 + %4 = add nsw i32 %0, -2 + %5 = mul i32 %4, %4 + %6 = add i32 %5, %0 + %7 = zext nneg i32 %4 to i33 + %8 = add nsw i32 %0, -3 + %9 = zext i32 %8 to i33 + %10 = mul i33 %7, %9 + %11 = lshr i33 %10, 1 + %12 = trunc nuw i33 %11 to i32 + %13 = xor i32 %12, -1 + %14 = add i32 %6, %13 + br label %15 +15: ; preds = %3, %1 + %16 = phi i32 [ 0, %1 ], [ %14, %3 ] + ret i32 %16 +} + +define dso_local noundef i32 @foo(i32 noundef %0) #1 !dbg !20 { + %2 = tail call noundef i32 @bar(i32 noundef %0), !dbg !24 + ret i32 %2 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "use-sample-profile" } +attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "use-sample-profile" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "a.cc", directory: ".") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: !1, file: !1, line: 1, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!11 = !DIFile(filename: "a.cc", directory: ".") +!12 = !DISubroutineType(types: !13) +!13 = !{!14, !14} +!14 = 
!DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!20 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !11, file: !11, line: 5, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!23 = !DILocation(line: 0, scope: !20) +!24 = !DILocation(line: 6, column: 12, scope: !20) diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll index 18cbd857d97bb2..2cd9abf0e11e94 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll @@ -98,7 +98,7 @@ if.end: ;YAML-NEXT: - String: '(cost=' ;YAML-NEXT: - Cost: '15' ;YAML-NEXT: - String: ', threshold=' -;YAML-NEXT: - Threshold: '2147483647' +;YAML-NEXT: - Threshold: '3000' ;YAML-NEXT: - String: ')' ;YAML-NEXT: - String: ' at callsite ' ;YAML-NEXT: - String: foo diff --git a/llvm/test/Transforms/SampleProfile/remarks.ll b/llvm/test/Transforms/SampleProfile/remarks.ll index 997e02bb5b5444..9c0143ae65ca77 100644 --- a/llvm/test/Transforms/SampleProfile/remarks.ll +++ b/llvm/test/Transforms/SampleProfile/remarks.ll @@ -22,7 +22,7 @@ ; We are expecting foo() to be inlined in main() (almost all the cycles are ; spent inside foo). -; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=2147483647) at callsite main:0:21; +; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=3000) at callsite main:0:21; ; CHECK: remark: remarks.cc:9:19: 'rand' inlined into 'main' to match profiling context with (cost=always): always inline attribute at callsite _Z3foov:6:19 @ main:0:21; ; The back edge for the loop is the hottest edge in the loop subgraph. 
@@ -51,7 +51,7 @@ ;YAML-NEXT: - String: '(cost=' ;YAML-NEXT: - Cost: '130' ;YAML-NEXT: - String: ', threshold=' -;YAML-NEXT: - Threshold: '2147483647' +;YAML-NEXT: - Threshold: '3000' ;YAML-NEXT: - String: ')' ;YAML-NEXT: - String: ' at callsite ' ;YAML-NEXT: - String: main From 6a47315a3cb2c6d381809f0ba5c89bd8dcdbcaa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 28 May 2024 22:45:32 +0200 Subject: [PATCH 64/89] [clang-repl] Even more tests create the Interpreter and must check host JIT support (#84758) --- .../Interpreter/CodeCompletionTest.cpp | 85 +++++++++++++++++++ .../Interpreter/IncrementalProcessingTest.cpp | 3 + 2 files changed, 88 insertions(+) diff --git a/clang/unittests/Interpreter/CodeCompletionTest.cpp b/clang/unittests/Interpreter/CodeCompletionTest.cpp index 873fbda32f0579..72c02c683fafd4 100644 --- a/clang/unittests/Interpreter/CodeCompletionTest.cpp +++ b/clang/unittests/Interpreter/CodeCompletionTest.cpp @@ -4,6 +4,7 @@ #include "clang/Lex/Preprocessor.h" #include "clang/Sema/CodeCompleteConsumer.h" #include "clang/Sema/Sema.h" +#include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/LineEditor/LineEditor.h" #include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" @@ -11,6 +12,10 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#if defined(_AIX) || defined(__MVS__) +#define CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +#endif + using namespace clang; namespace { auto CB = clang::IncrementalCompilerBuilder(); @@ -50,7 +55,21 @@ static std::vector runComp(clang::Interpreter &MainInterp, return Comps; } +static bool HostSupportsJit() { + auto J = llvm::orc::LLJITBuilder().create(); + if (J) + return true; + LLVMConsumeError(llvm::wrap(J.takeError())); + return false; +} + +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_Sanity) { +#else TEST(CodeCompletionTest, Sanity) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); 
cantFail(Interp->Parse("int foo = 12;")); auto Err = llvm::Error::success(); @@ -61,7 +80,13 @@ TEST(CodeCompletionTest, Sanity) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_SanityNoneValid) { +#else TEST(CodeCompletionTest, SanityNoneValid) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int foo = 12;")); auto Err = llvm::Error::success(); @@ -70,7 +95,13 @@ TEST(CodeCompletionTest, SanityNoneValid) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_TwoDecls) { +#else TEST(CodeCompletionTest, TwoDecls) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int application = 12;")); cantFail(Interp->Parse("int apple = 12;")); @@ -80,14 +111,26 @@ TEST(CodeCompletionTest, TwoDecls) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_CompFunDeclsNoError) { +#else TEST(CodeCompletionTest, CompFunDeclsNoError) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); auto Err = llvm::Error::success(); auto comps = runComp(*Interp, "void app(", Err); EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_TypedDirected) { +#else TEST(CodeCompletionTest, TypedDirected) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int application = 12;")); cantFail(Interp->Parse("char apple = '2';")); @@ -119,7 +162,13 @@ TEST(CodeCompletionTest, TypedDirected) { } } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_SanityClasses) { +#else TEST(CodeCompletionTest, SanityClasses) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = 
createInterpreter(); cantFail(Interp->Parse("struct Apple{};")); cantFail(Interp->Parse("void takeApple(Apple &a1){}")); @@ -142,7 +191,13 @@ TEST(CodeCompletionTest, SanityClasses) { } } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_SubClassing) { +#else TEST(CodeCompletionTest, SubClassing) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("struct Fruit {};")); cantFail(Interp->Parse("struct Apple : Fruit{};")); @@ -157,7 +212,13 @@ TEST(CodeCompletionTest, SubClassing) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_MultipleArguments) { +#else TEST(CodeCompletionTest, MultipleArguments) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int foo = 42;")); cantFail(Interp->Parse("char fowl = 'A';")); @@ -169,7 +230,13 @@ TEST(CodeCompletionTest, MultipleArguments) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_Methods) { +#else TEST(CodeCompletionTest, Methods) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse( "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};")); @@ -183,7 +250,13 @@ TEST(CodeCompletionTest, Methods) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_MethodsInvocations) { +#else TEST(CodeCompletionTest, MethodsInvocations) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse( "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};")); @@ -197,7 +270,13 @@ TEST(CodeCompletionTest, MethodsInvocations) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, 
DISABLED_NestedInvocations) { +#else TEST(CodeCompletionTest, NestedInvocations) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse( "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};")); @@ -212,7 +291,13 @@ TEST(CodeCompletionTest, NestedInvocations) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_TemplateFunctions) { +#else TEST(CodeCompletionTest, TemplateFunctions) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail( Interp->Parse("template T id(T a) { return a;} ")); diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp index f3b091b0c0e6cb..9a99ff6262fa3c 100644 --- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp +++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp @@ -61,6 +61,9 @@ TEST(IncrementalProcessing, DISABLED_EmitCXXGlobalInitFunc) { #else TEST(IncrementalProcessing, EmitCXXGlobalInitFunc) { #endif + if (!HostSupportsJit()) + GTEST_SKIP(); + std::vector ClangArgv = {"-Xclang", "-emit-llvm-only"}; auto CB = clang::IncrementalCompilerBuilder(); CB.SetCompilerArgs(ClangArgv); From 98fa0f6981f33b7d8f5aa38babc1e71bc0209de8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 28 May 2024 20:40:58 +0200 Subject: [PATCH 65/89] DAG: Handle vector splitting for fminnum_ieee/fmaxnum_ieee Avoids regression in future commit which starts producing illegal instances. 
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 14e8708fd3f38f..361416edb554ca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1174,8 +1174,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FADD: case ISD::VP_FADD: case ISD::FSUB: case ISD::VP_FSUB: case ISD::FMUL: case ISD::VP_FMUL: - case ISD::FMINNUM: case ISD::VP_FMINNUM: - case ISD::FMAXNUM: case ISD::VP_FMAXNUM: + case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: + case ISD::VP_FMINNUM: + case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: + case ISD::VP_FMAXNUM: case ISD::FMINIMUM: case ISD::VP_FMINIMUM: case ISD::FMAXIMUM: From bbca20f0b1ab7c6ea36a84e88a6abb07f94ca80b Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Tue, 28 May 2024 23:04:12 +0200 Subject: [PATCH 66/89] [Clang][NFC] remove CHAR_PUNCT duplication introduced by #93216 (#93605) --- clang/include/clang/Basic/CharInfo.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h index 4d90528f7992e3..d71857e8e5dcc3 100644 --- a/clang/include/clang/Basic/CharInfo.h +++ b/clang/include/clang/Basic/CharInfo.h @@ -151,8 +151,7 @@ LLVM_READONLY inline bool isHexDigit(unsigned char c) { /// Note that '_' is both a punctuation character and an identifier character! 
LLVM_READONLY inline bool isPunctuation(unsigned char c) { using namespace charinfo; - return (InfoTable[c] & - (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT | CHAR_PUNCT)) != 0; + return (InfoTable[c] & (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT)) != 0; } /// Return true if this character is an ASCII printable character; that is, a From df542e1ed82bd4e5a9e345d3a3ae63a76893a0cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 28 May 2024 23:18:45 +0200 Subject: [PATCH 67/89] Fix build: [clang-repl] Even more tests create the Interpreter and must check host JIT support (#84758) fea7399e97b73a3209fcbe3338d412069769a637 had removed the unused function that was still there when I tested. --- clang/unittests/Interpreter/IncrementalProcessingTest.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp index 9a99ff6262fa3c..732753f11306e6 100644 --- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp +++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp @@ -56,6 +56,14 @@ const Function *getGlobalInit(llvm::Module *M) { return nullptr; } +static bool HostSupportsJit() { + auto J = llvm::orc::LLJITBuilder().create(); + if (J) + return true; + LLVMConsumeError(llvm::wrap(J.takeError())); + return false; +} + #ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT TEST(IncrementalProcessing, DISABLED_EmitCXXGlobalInitFunc) { #else From ed4227aad37f2c4adf307b63050fb9aee52b07f8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 14:37:15 -0700 Subject: [PATCH 68/89] [SCEV] Add tests for symbolic max BTC requiring predicates. Add extra tests for https://github.com/llvm/llvm-project/pull/93498. 
--- ...cated-symbolic-max-backedge-taken-count.ll | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll new file mode 100644 index 00000000000000..d40416359b65c6 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s + +; %i and %i + 1 can overflow. +define void @test1(i64 %x, ptr %a, ptr %b) { +; CHECK-LABEL: 'test1' +; CHECK-NEXT: Determining loop execution counts for: @test1 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; +entry: + br label %header + +header: + %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ] + %i.010 = phi i32 [ 0, %entry ], [ %add, %latch ] + %add = add i32 %i.010, 1 + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom + %ld = load i32, ptr %arrayidx, align 4 + %uncountable.c = icmp eq i32 %ld, 10 + br i1 %uncountable.c, label %exit, label %latch + +latch: + %add2 = add nsw i32 %ld, 1 + %arrayidx4 = getelementptr inbounds i32, ptr %b, i64 %conv11 + store i32 %add2, ptr %arrayidx4, align 4 + %conv = zext i32 %add to i64 + %cmp = icmp ult i64 %conv, %x + br i1 %cmp, label %header, label %exit + +exit: + ret void +} + +; %i can overflow. +; +; We need to check that i doesn't wrap, but we don't need a run-time alias +; check. We also need an extra no-wrap check to get the backedge taken count. +define void @test2(i64 %x, ptr %a) { +; CHECK-LABEL: 'test2' +; CHECK-NEXT: Determining loop execution counts for: @test2 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; +entry: + br label %header + +header: + %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ] + %i.010 = phi i32 [ 0, %entry ], [ %inc, %latch ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %conv11 + %ld = load i32, ptr %arrayidx, align 4 + %uncountable.c = icmp eq i32 %ld, 10 + br i1 %uncountable.c, label %exit, label %latch + +latch: + %add = add nsw i32 %ld, 1 + store i32 %add, ptr %arrayidx, align 4 + %inc = add i32 %i.010, 1 + %conv = zext i32 %inc to i64 + %cmp = icmp ult i64 %conv, %x + br i1 %cmp, label %header, label %exit + +exit: + ret void +} From e3f74d4589e29279e9f543b58577a2ece102dc6f Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 28 May 2024 14:25:13 -0700 Subject: [PATCH 69/89] [OpenACC] Correct serialization of certain clause sub-expressions For some reason I was using writeStmtRef when I meant writeStmt, so this corrects that. 
--- clang/lib/Serialization/ASTWriter.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index dd548fabfd9551..e830c4026ea78f 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7835,7 +7835,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::If: { const auto *IC = cast(C); writeSourceLocation(IC->getLParenLoc()); - writeStmtRef(IC->getConditionExpr()); + AddStmt(const_cast(IC->getConditionExpr())); return; } case OpenACCClauseKind::Self: { @@ -7843,7 +7843,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeSourceLocation(SC->getLParenLoc()); writeBool(SC->hasConditionExpr()); if (SC->hasConditionExpr()) - writeStmtRef(SC->getConditionExpr()); + AddStmt(const_cast(SC->getConditionExpr())); return; } case OpenACCClauseKind::NumGangs: { @@ -7857,13 +7857,13 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::NumWorkers: { const auto *NWC = cast(C); writeSourceLocation(NWC->getLParenLoc()); - writeStmtRef(NWC->getIntExpr()); + AddStmt(const_cast(NWC->getIntExpr())); return; } case OpenACCClauseKind::VectorLength: { const auto *NWC = cast(C); writeSourceLocation(NWC->getLParenLoc()); - writeStmtRef(NWC->getIntExpr()); + AddStmt(const_cast(NWC->getIntExpr())); return; } case OpenACCClauseKind::Private: { @@ -7942,15 +7942,15 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeSourceLocation(AC->getLParenLoc()); writeBool(AC->hasIntExpr()); if (AC->hasIntExpr()) - writeStmtRef(AC->getIntExpr()); + AddStmt(const_cast(AC->getIntExpr())); return; } case OpenACCClauseKind::Wait: { const auto *WC = cast(C); writeSourceLocation(WC->getLParenLoc()); writeBool(WC->getDevNumExpr()); - if (const Expr *DNE = WC->getDevNumExpr()) - writeStmtRef(DNE); + if (Expr *DNE = WC->getDevNumExpr()) + 
AddStmt(DNE); writeSourceLocation(WC->getQueuesLoc()); writeOpenACCIntExprList(WC->getQueueIdExprs()); From 060b3023e198d197b47c652f19af5f7dea3a22cc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 14:49:57 -0700 Subject: [PATCH 70/89] [RISCV] Move TRUNCATE_VECTOR_VL combine into a helper function. NFC (#93574) I plan to add other combines on TRUNCATE_VECTOR_VL. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 103 ++++++++++---------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c826892c1668ec..5fc613c1b2a140 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16087,6 +16087,57 @@ static bool matchIndexAsWiderOp(EVT VT, SDValue Index, SDValue Mask, return true; } +static SDValue combineTruncOfSraSext(SDNode *N, SelectionDAG &DAG) { + // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1)) + // This would be benefit for the cases where X and Y are both the same value + // type of low precision vectors. Since the truncate would be lowered into + // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate + // restriction, such pattern would be expanded into a series of "vsetvli" + // and "vnsrl" instructions later to reach this point. + auto IsTruncNode = [](SDValue V) { + if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL) + return false; + SDValue VL = V.getOperand(2); + auto *C = dyn_cast(VL); + // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand + bool IsVLMAXForVMSET = (C && C->isAllOnes()) || + (isa(VL) && + cast(VL)->getReg() == RISCV::X0); + return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL && IsVLMAXForVMSET; + }; + + SDValue Op = N->getOperand(0); + + // We need to first find the inner level of TRUNCATE_VECTOR_VL node + // to distinguish such pattern. 
+ while (IsTruncNode(Op)) { + if (!Op.hasOneUse()) + return SDValue(); + Op = Op.getOperand(0); + } + + if (Op.getOpcode() != ISD::SRA || !Op.hasOneUse()) + return SDValue(); + + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + if (N0.getOpcode() != ISD::SIGN_EXTEND || !N0.hasOneUse() || + N1.getOpcode() != ISD::ZERO_EXTEND || !N1.hasOneUse()) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + if (!N00.getValueType().isVector() || + N00.getValueType() != N10.getValueType() || + N->getValueType(0) != N10.getValueType()) + return SDValue(); + + unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1; + SDValue SMin = + DAG.getNode(ISD::SMIN, SDLoc(N1), N->getValueType(0), N10, + DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0))); + return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin); +} SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -16304,56 +16355,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } } return SDValue(); - case RISCVISD::TRUNCATE_VECTOR_VL: { - // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1)) - // This would be benefit for the cases where X and Y are both the same value - // type of low precision vectors. Since the truncate would be lowered into - // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate - // restriction, such pattern would be expanded into a series of "vsetvli" - // and "vnsrl" instructions later to reach this point. 
- auto IsTruncNode = [](SDValue V) { - if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL) - return false; - SDValue VL = V.getOperand(2); - auto *C = dyn_cast(VL); - // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand - bool IsVLMAXForVMSET = (C && C->isAllOnes()) || - (isa(VL) && - cast(VL)->getReg() == RISCV::X0); - return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL && - IsVLMAXForVMSET; - }; - - SDValue Op = N->getOperand(0); - - // We need to first find the inner level of TRUNCATE_VECTOR_VL node - // to distinguish such pattern. - while (IsTruncNode(Op)) { - if (!Op.hasOneUse()) - return SDValue(); - Op = Op.getOperand(0); - } - - if (Op.getOpcode() == ISD::SRA && Op.hasOneUse()) { - SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && - N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse()) { - SDValue N00 = N0.getOperand(0); - SDValue N10 = N1.getOperand(0); - if (N00.getValueType().isVector() && - N00.getValueType() == N10.getValueType() && - N->getValueType(0) == N10.getValueType()) { - unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1; - SDValue SMin = DAG.getNode( - ISD::SMIN, SDLoc(N1), N->getValueType(0), N10, - DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0))); - return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin); - } - } - } - break; - } + case RISCVISD::TRUNCATE_VECTOR_VL: + return combineTruncOfSraSext(N, DAG); case ISD::TRUNCATE: return performTRUNCATECombine(N, DAG, Subtarget); case ISD::SELECT: From 00bd2fa1982f3114536323209fffad909463effc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 28 May 2024 14:57:13 -0700 Subject: [PATCH 71/89] [flang][cuda] Add bind c to cudadevice procedures (#92822) This patch adds bind c names to functions and subroutines in 
cudadevice so they can be lowered and not hit the intrinsic procedure TODOs. --- flang/module/cudadevice.f90 | 16 +++++----- flang/test/Lower/CUDA/cuda-device-proc.cuf | 36 ++++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-device-proc.cuf diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index f34820dd10792a..0224ecfdde7c60 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -18,34 +18,34 @@ module cudadevice ! Synchronization Functions interface - attributes(device) subroutine syncthreads() + attributes(device) subroutine syncthreads() bind(c, name='__syncthreads') end subroutine end interface public :: syncthreads interface - attributes(device) integer function syncthreads_and(value) + attributes(device) integer function syncthreads_and(value) bind(c, name='__syncthreads_and') integer :: value end function end interface public :: syncthreads_and interface - attributes(device) integer function syncthreads_count(value) + attributes(device) integer function syncthreads_count(value) bind(c, name='__syncthreads_count') integer :: value end function end interface public :: syncthreads_count interface - attributes(device) integer function syncthreads_or(value) + attributes(device) integer function syncthreads_or(value) bind(c, name='__syncthreads_or') integer :: value end function end interface public :: syncthreads_or interface - attributes(device) subroutine syncwarp(mask) + attributes(device) subroutine syncwarp(mask) bind(c, name='__syncwarp') integer :: mask end subroutine end interface @@ -54,19 +54,19 @@ attributes(device) subroutine syncwarp(mask) ! 
Memory Fences interface - attributes(device) subroutine threadfence() + attributes(device) subroutine threadfence() bind(c, name='__threadfence') end subroutine end interface public :: threadfence interface - attributes(device) subroutine threadfence_block() + attributes(device) subroutine threadfence_block() bind(c, name='__threadfence_block') end subroutine end interface public :: threadfence_block interface - attributes(device) subroutine threadfence_system() + attributes(device) subroutine threadfence_system() bind(c, name='__threadfence_system') end subroutine end interface public :: threadfence_system diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf new file mode 100644 index 00000000000000..0c71ea6efcd632 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -0,0 +1,36 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran procedures available in cudadevice module + +attributes(global) subroutine devsub() + implicit none + integer :: ret + + call syncthreads() + call syncwarp(1) + call threadfence() + call threadfence_block() + call threadfence_system() + ret = syncthreads_and(1) + ret = syncthreads_count(1) + ret = syncthreads_or(1) +end + +! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: fir.call @__syncthreads() +! CHECK: fir.call @__syncwarp(%{{.*}}) fastmath : (!fir.ref) -> () +! CHECK: fir.call @__threadfence() +! CHECK: fir.call @__threadfence_block() +! CHECK: fir.call @__threadfence_system() +! CHECK: %{{.*}} = fir.call @__syncthreads_and(%{{.*}}) fastmath : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @__syncthreads_count(%{{.*}}) fastmath : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @__syncthreads_or(%{{.*}}) fastmath : (!fir.ref) -> i32 + +! CHECK: func.func private @__syncthreads() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads"} +! 
CHECK: func.func private @__syncwarp(!fir.ref {cuf.data_attr = #cuf.cuda}) attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncwarp"} +! CHECK: func.func private @__threadfence() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence"} +! CHECK: func.func private @__threadfence_block() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence_block"} +! CHECK: func.func private @__threadfence_system() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence_system"} +! CHECK: func.func private @__syncthreads_and(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_and"} +! CHECK: func.func private @__syncthreads_count(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_count"} +! CHECK: func.func private @__syncthreads_or(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_or"} From 2d00c6fe06b6d709b4ab3d6b253df304c04e0c1f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 15:05:23 -0700 Subject: [PATCH 72/89] [RISCV] Add a rematerializable pseudo instruction for LUI+ADDI for global addresses. (#93352) This allows register allocation to rematerialize these instead of spilling and reloading. We need to make it a single instruction due to limitations in rematerialization. This pseudo is expanded to an LUI+ADDI pair between regalloc and post RA scheduling. This improves the dynamic instruction count on 531.deepsjeng_r from spec2017 by 3.2% for the train dataset. 500.perlbench and 502.gcc see a 1% improvement. There are couple regressions, but they are 0.1% or smaller. 
AArch64 has similar pseudo instructions like MOVaddr --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 20 ++ .../lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 35 ++- .../RISCV/RISCVPostRAExpandPseudoInsts.cpp | 23 ++ llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 22 +- .../CodeGen/RISCV/ctz_zero_return_test.ll | 8 +- .../early-clobber-tied-def-subreg-liveness.ll | 14 +- .../test/CodeGen/RISCV/fold-addi-loadstore.ll | 4 +- llvm/test/CodeGen/RISCV/rv32xtheadbb.ll | 4 +- llvm/test/CodeGen/RISCV/rv32zbb.ll | 4 +- .../CodeGen/RISCV/rvv/active_lane_mask.ll | 40 +-- .../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 4 +- .../rvv/fixed-vectors-interleaved-access.ll | 275 +++++++++--------- .../RISCV/rvv/fixed-vectors-mask-buildvec.ll | 20 +- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 16 +- .../rvv/fixed-vectors-shuffle-reverse.ll | 80 ++--- .../RISCV/rvv/fixed-vectors-stepvector.ll | 10 +- .../test/CodeGen/RISCV/rvv/shuffle-reverse.ll | 50 ++-- llvm/test/CodeGen/RISCV/tail-calls.ll | 8 +- llvm/test/CodeGen/RISCV/unroll-loop-cse.ll | 32 +- 19 files changed, 358 insertions(+), 311 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index ce50fe6e2cbb02..a1b078910e29c9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1311,6 +1311,26 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12), /// HI and ADD_LO address nodes. +// Pseudo for a rematerializable LUI+ADDI sequence for loading an address. +// It will be expanded after register allocation. +// FIXME: The scheduling information does not reflect the multiple instructions. 
+let Size = 8, isReMaterializable = 1 in +def PseudoMovAddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>, + Sched<[WriteIALU]>; + +def riscv_hi_oneuse : unop_oneuse; +def addr_hi_lo : PatFrag<(ops node:$hi, node:$lo), + (riscv_add_lo (riscv_hi_oneuse node:$hi), node:$lo)>; + +def : Pat<(addr_hi_lo tglobaladdr:$hi, tglobaladdr:$lo), + (PseudoMovAddr tglobaladdr:$hi, tglobaladdr:$lo)>; +def : Pat<(addr_hi_lo tblockaddress:$hi, tblockaddress:$lo), + (PseudoMovAddr tblockaddress:$hi, tblockaddress:$lo)>; +def : Pat<(addr_hi_lo tjumptable:$hi, tjumptable:$lo), + (PseudoMovAddr tjumptable:$hi, tjumptable:$lo)>; +def : Pat<(addr_hi_lo tconstpool:$hi, tconstpool:$lo), + (PseudoMovAddr tconstpool:$hi, tconstpool:$lo)>; + def : Pat<(riscv_hi tglobaladdr:$in), (LUI tglobaladdr:$in)>; def : Pat<(riscv_hi tblockaddress:$in), (LUI tblockaddress:$in)>; def : Pat<(riscv_hi tjumptable:$in), (LUI tjumptable:$in)>; diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index 410989177a8b9c..fecc83a821f420 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -84,7 +84,8 @@ INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, DEBUG_TYPE, // 3) The offset value in the Global Address or Constant Pool is 0. 
bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi, MachineInstr *&Lo) { - if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC) + if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC && + Hi.getOpcode() != RISCV::PseudoMovAddr) return false; const MachineOperand &HiOp1 = Hi.getOperand(1); @@ -97,16 +98,22 @@ bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi, HiOp1.getOffset() != 0) return false; - Register HiDestReg = Hi.getOperand(0).getReg(); - if (!MRI->hasOneUse(HiDestReg)) - return false; + if (Hi.getOpcode() == RISCV::PseudoMovAddr) { + // Most of the code should handle it correctly without modification by + // setting Lo and Hi both point to PseudoMovAddr + Lo = &Hi; + } else { + Register HiDestReg = Hi.getOperand(0).getReg(); + if (!MRI->hasOneUse(HiDestReg)) + return false; - Lo = &*MRI->use_instr_begin(HiDestReg); - if (Lo->getOpcode() != RISCV::ADDI) - return false; + Lo = &*MRI->use_instr_begin(HiDestReg); + if (Lo->getOpcode() != RISCV::ADDI) + return false; + } const MachineOperand &LoOp2 = Lo->getOperand(2); - if (Hi.getOpcode() == RISCV::LUI) { + if (Hi.getOpcode() == RISCV::LUI || Hi.getOpcode() == RISCV::PseudoMovAddr) { if (LoOp2.getTargetFlags() != RISCVII::MO_LO || !(LoOp2.isGlobal() || LoOp2.isCPI() || LoOp2.isBlockAddress()) || LoOp2.getOffset() != 0) @@ -466,6 +473,13 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, Hi.getOperand(1).setOffset(NewOffset); MachineOperand &ImmOp = Lo.getOperand(2); + // Expand PseudoMovAddr into LUI + if (Hi.getOpcode() == RISCV::PseudoMovAddr) { + auto *TII = ST->getInstrInfo(); + Hi.setDesc(TII->get(RISCV::LUI)); + Hi.removeOperand(2); + } + if (Hi.getOpcode() != RISCV::AUIPC) ImmOp.setOffset(NewOffset); @@ -501,6 +515,11 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, } } + // Prevent Lo (originally PseudoMovAddr, which is also pointed by Hi) from + // being erased + if (&Lo == &Hi) + return true; + 
MRI->replaceRegWith(Lo.getOperand(0).getReg(), Hi.getOperand(0).getReg()); Lo.eraseFromParent(); return true; diff --git a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp index 52f2ce27164d6e..b7b0c47c084c64 100644 --- a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp @@ -44,6 +44,7 @@ class RISCVPostRAExpandPseudo : public MachineFunctionPass { bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); bool expandMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMovAddr(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; char RISCVPostRAExpandPseudo::ID = 0; @@ -75,6 +76,8 @@ bool RISCVPostRAExpandPseudo::expandMI(MachineBasicBlock &MBB, switch (MBBI->getOpcode()) { case RISCV::PseudoMovImm: return expandMovImm(MBB, MBBI); + case RISCV::PseudoMovAddr: + return expandMovAddr(MBB, MBBI); default: return false; } @@ -101,6 +104,26 @@ bool RISCVPostRAExpandPseudo::expandMovImm(MachineBasicBlock &MBB, return true; } +bool RISCVPostRAExpandPseudo::expandMovAddr(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + DebugLoc DL = MBBI->getDebugLoc(); + + Register DstReg = MBBI->getOperand(0).getReg(); + bool DstIsDead = MBBI->getOperand(0).isDead(); + bool Renamable = MBBI->getOperand(0).isRenamable(); + + BuildMI(MBB, MBBI, DL, TII->get(RISCV::LUI)) + .addReg(DstReg, RegState::Define | getRenamableRegState(Renamable)) + .add(MBBI->getOperand(1)); + BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead) | + getRenamableRegState(Renamable)) + .addReg(DstReg, RegState::Kill | getRenamableRegState(Renamable)) + .add(MBBI->getOperand(2)); + MBBI->eraseFromParent(); + return true; +} + } // end of anonymous namespace INITIALIZE_PASS(RISCVPostRAExpandPseudo, "riscv-expand-pseudolisimm32", diff 
--git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index 549d531e829ea5..a90c244437a033 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -383,8 +383,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI3_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0) +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 @@ -442,9 +442,9 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32M-LABEL: test_cttz_i64: ; RV32M: # %bb.0: ; RV32M-NEXT: lui a2, 30667 -; RV32M-NEXT: addi a2, a2, 1329 -; RV32M-NEXT: lui a3, %hi(.LCPI3_0) -; RV32M-NEXT: addi a3, a3, %lo(.LCPI3_0) +; RV32M-NEXT: addi a3, a2, 1329 +; RV32M-NEXT: lui a2, %hi(.LCPI3_0) +; RV32M-NEXT: addi a2, a2, %lo(.LCPI3_0) ; RV32M-NEXT: bnez a1, .LBB3_3 ; RV32M-NEXT: # %bb.1: ; RV32M-NEXT: li a1, 32 @@ -452,18 +452,18 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32M-NEXT: .LBB3_2: ; RV32M-NEXT: neg a1, a0 ; RV32M-NEXT: and a0, a0, a1 -; RV32M-NEXT: mul a0, a0, a2 +; RV32M-NEXT: mul a0, a0, a3 ; RV32M-NEXT: srli a0, a0, 27 -; RV32M-NEXT: add a0, a3, a0 +; RV32M-NEXT: add a0, a2, a0 ; RV32M-NEXT: lbu a0, 0(a0) ; RV32M-NEXT: li a1, 0 ; RV32M-NEXT: ret ; RV32M-NEXT: .LBB3_3: ; RV32M-NEXT: neg a4, a1 ; RV32M-NEXT: and a1, a1, a4 -; RV32M-NEXT: mul a1, a1, a2 +; RV32M-NEXT: mul a1, a1, a3 ; RV32M-NEXT: srli a1, a1, 27 -; RV32M-NEXT: add a1, a3, a1 +; RV32M-NEXT: add a1, a2, a1 ; RV32M-NEXT: lbu a1, 0(a1) ; RV32M-NEXT: bnez a0, .LBB3_2 ; RV32M-NEXT: .LBB3_4: @@ -814,8 +814,8 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI7_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI7_0) +; RV32I-NEXT: lui s4, %hi(.LCPI7_0) 
+; RV32I-NEXT: addi s4, s4, %lo(.LCPI7_0) ; RV32I-NEXT: neg a0, s1 ; RV32I-NEXT: and a0, s1, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll index 9ae30e646fdbf7..fe6e20d852d590 100644 --- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll +++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll @@ -48,8 +48,8 @@ define signext i32 @ctz_dereferencing_pointer(ptr %b) nounwind { ; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI0_0) -; RV32I-NEXT: addi s3, a0, %lo(.LCPI0_0) +; RV32I-NEXT: lui s3, %hi(.LCPI0_0) +; RV32I-NEXT: addi s3, s3, %lo(.LCPI0_0) ; RV32I-NEXT: neg a0, s4 ; RV32I-NEXT: and a0, s4, a0 ; RV32I-NEXT: mv a1, s1 @@ -511,8 +511,8 @@ define signext i32 @ctz4(i64 %b) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI6_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI6_0) +; RV32I-NEXT: lui s4, %hi(.LCPI6_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI6_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll index eb6ac985287a10..478d2eae9dca2c 100644 --- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -24,31 +24,31 @@ define void @_Z3foov() { ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_49) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_49) ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_48) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_48) -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: 
add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46) -; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_45) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45) -; CHECK-NEXT: vle16.v v14, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v12, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v14, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vs2r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_40) diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll index 3c2e84689c979c..62b1549a5d58ad 100644 --- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll @@ -389,8 +389,8 @@ define dso_local i32 @load_ga() local_unnamed_addr #0 { define dso_local i64 @load_ga_8() nounwind { ; RV32I-LABEL: load_ga_8: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a0, %hi(ga_8) -; RV32I-NEXT: addi a1, a0, %lo(ga_8) +; RV32I-NEXT: lui a1, %hi(ga_8) +; RV32I-NEXT: addi a1, a1, %lo(ga_8) ; RV32I-NEXT: lw a0, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll index b45ab135fa1c7c..197366e7e05fe8 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll @@ -209,8 
+209,8 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI3_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0) +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 7e6c3f9c87d277..f25aa0de89da88 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -199,8 +199,8 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI3_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0) +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll index 9cb3991f31f94d..08b310213d16e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -126,28 +126,28 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v24, v16 +; CHECK-NEXT: vsaddu.vx v16, v24, a1 +; CHECK-NEXT: vmsltu.vx v9, v16, a2 ; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v8, v16, a2 -; CHECK-NEXT: vsext.vf8 v16, v9 -; 
CHECK-NEXT: vsaddu.vx v16, v16, a1 ; CHECK-NEXT: lui a0, %hi(.LCPI9_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmsltu.vx v10, v16, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v8, 2 +; CHECK-NEXT: vslideup.vi v0, v9, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v10, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma @@ -169,13 +169,13 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v10, v16, a2 +; CHECK-NEXT: vmsltu.vx v8, v16, a2 ; CHECK-NEXT: vsext.vf8 v16, v9 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v8, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) ; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vmsltu.vx v10, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_3) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) ; CHECK-NEXT: vle8.v v11, (a0) @@ -187,10 +187,10 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vmsltu.vx v11, v16, a2 ; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_4) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) ; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_5) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) ; CHECK-NEXT: vle8.v v13, (a0) @@ -201,27 +201,27 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsaddu.vx v16, v16, a1 ; CHECK-NEXT: vmsltu.vx v13, v16, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: 
vslideup.vi v10, v8, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: vslideup.vi v10, v9, 4 ; CHECK-NEXT: lui a0, %hi(.LCPI10_6) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v11, 6 +; CHECK-NEXT: vslideup.vi v10, v11, 6 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v12, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v13, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v9, v16, a2 +; CHECK-NEXT: vmsltu.vx v8, v16, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v0, v9, 6 +; CHECK-NEXT: vslideup.vi v0, v8, 6 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vi v0, v8, 8 +; CHECK-NEXT: vslideup.vi v0, v10, 8 ; CHECK-NEXT: ret %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc) ret <128 x i1> %mask diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 79c36a629465d9..f4d7074c7f6b27 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -3459,6 +3459,8 @@ define void @mulhu_v4i64(ptr %x) { ; RV64-NEXT: lui a1, %hi(.LCPI184_0) ; RV64-NEXT: addi a1, a1, %lo(.LCPI184_0) ; RV64-NEXT: vle64.v v10, (a1) +; RV64-NEXT: vmulhu.vv v10, v8, v10 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: li a1, -1 ; RV64-NEXT: slli a1, a1, 63 ; RV64-NEXT: vmv.s.x v12, a1 @@ -3466,8 +3468,6 @@ define void @mulhu_v4i64(ptr %x) { ; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma ; RV64-NEXT: vslideup.vi v14, v12, 2 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; 
RV64-NEXT: vmulhu.vv v10, v8, v10 -; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: vmulhu.vv v8, v8, v14 ; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a1, 12320 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 178a920169ad96..bc3e135a588a6f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -159,17 +159,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 82 +; RV32-NEXT: li a3, 80 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 82 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb ; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 57 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 6 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill @@ -177,26 +176,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vslideup.vi v8, v16, 4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 41 +; RV32-NEXT: li a5, 40 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 -; RV32-NEXT: vmv.s.x v1, a4 +; RV32-NEXT: vmv.s.x v0, a4 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi 
v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 6 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: li a5, 56 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vmv1r.v v3, v0 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 45 +; RV32-NEXT: li a5, 44 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -206,8 +205,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 5 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: slli a4, a4, 5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill @@ -216,21 +214,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a5, 1 ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a6, 25 +; RV32-NEXT: li a6, 24 ; RV32-NEXT: mul a4, a4, a6 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 73 +; RV32-NEXT: li a4, 72 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -238,27 +236,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a5, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; 
RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v16, v8, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -266,259 +263,257 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v12, v8, 2 +; RV32-NEXT: vmv1r.v v8, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v1, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vs1r.v v3, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; 
RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vslideup.vi v12, v16, 8, v0.t -; RV32-NEXT: vmv.v.v v20, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) -; RV32-NEXT: lui a4, %hi(.LCPI6_4) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: vle16.v v16, (a3) -; RV32-NEXT: addi a1, a4, %lo(.LCPI6_4) +; RV32-NEXT: vle16.v v0, (a1) +; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: lui a1, %hi(.LCPI6_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v2, (a1) +; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v24, v8, v4 +; RV32-NEXT: vrgatherei16.vv v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v24 +; RV32-NEXT: vmv.v.v v12, 
v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v16, v24, v2 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vrgatherei16.vv v12, v24, v10 +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v16, v8, 6, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v12, v24, 6, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v4, (a3) -; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; 
RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 960 +; RV32-NEXT: vmv.s.x v8, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v0, v12 +; RV32-NEXT: vmv1r.v v3, v8 +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) ; RV32-NEXT: lui a3, %hi(.LCPI6_8) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_8) -; RV32-NEXT: lui a4, %hi(.LCPI6_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: addi a1, a4, %lo(.LCPI6_9) +; RV32-NEXT: lui a1, %hi(.LCPI6_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_9) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v24, (a3) -; RV32-NEXT: vle16.v v28, (a1) +; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs4r.v v12, 
(a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v4, v0, v8 +; RV32-NEXT: vrgatherei16.vv v12, v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v4, v8, 4, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v24, v16 +; RV32-NEXT: vslideup.vi v12, v16, 4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v16, v4 +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; 
RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v3, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v16, 6 +; RV32-NEXT: vslideup.vi v8, v16, 6 ; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: vle16.v v12, (a3) +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: 
csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8 +; RV32-NEXT: vrgatherei16.vv v8, v16, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 2 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_13) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13) ; RV32-NEXT: lui a3, %hi(.LCPI6_14) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_14) -; RV32-NEXT: lui a4, %hi(.LCPI6_15) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a1) -; RV32-NEXT: addi a1, a4, %lo(.LCPI6_15) +; RV32-NEXT: lui a1, %hi(.LCPI6_15) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_15) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; RV32-NEXT: vle16.v v24, (a3) ; RV32-NEXT: vle16.v v8, (a1) @@ -526,27 +521,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded 
Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vrgatherei16.vv v16, v8, v20, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -554,7 +548,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -562,12 +556,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 49 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -576,31 +570,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 21 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: 
addi a1, a1, 16 -; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 13 +; RV32-NEXT: li a2, 12 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 57 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v28, v0 ; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: addi a1, a0, 320 @@ -614,21 +605,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 45 +; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 82 +; RV32-NEXT: li a1, 80 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll index 17483151869365..7608349ef7aeff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -549,20 +549,20 @@ define <128 x i1> @buildvec_mask_v128i1() { define <128 x i1> @buildvec_mask_optsize_v128i1() optsize { ; CHECK-LABEL: buildvec_mask_optsize_v128i1: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI21_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI21_0) -; CHECK-NEXT: li a1, 128 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: lui a1, %hi(.LCPI21_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI21_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_optsize_v128i1: ; ZVE32F: # %bb.0: -; ZVE32F-NEXT: lui a0, %hi(.LCPI21_0) -; ZVE32F-NEXT: addi a0, a0, %lo(.LCPI21_0) -; ZVE32F-NEXT: li a1, 128 -; ZVE32F-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; ZVE32F-NEXT: vlm.v v0, (a0) +; ZVE32F-NEXT: li a0, 128 +; ZVE32F-NEXT: lui a1, %hi(.LCPI21_0) +; ZVE32F-NEXT: addi a1, a1, %lo(.LCPI21_0) +; ZVE32F-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; ZVE32F-NEXT: vlm.v v0, (a1) ; ZVE32F-NEXT: ret ret <128 x i1> } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index db0969c85a8e24..69341981288b91 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -13327,22 +13327,22 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; RV32-LABEL: mgather_shuffle_vrgather: ; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI119_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI119_0) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: 
vle16.v v9, (a0) -; RV32-NEXT: lui a0, %hi(.LCPI119_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI119_0) +; RV32-NEXT: vle16.v v9, (a1) ; RV32-NEXT: vle16.v v10, (a0) -; RV32-NEXT: vrgather.vv v8, v9, v10 +; RV32-NEXT: vrgather.vv v8, v10, v9 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_shuffle_vrgather: ; RV64V: # %bb.0: +; RV64V-NEXT: lui a1, %hi(.LCPI119_0) +; RV64V-NEXT: addi a1, a1, %lo(.LCPI119_0) ; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vle16.v v9, (a0) -; RV64V-NEXT: lui a0, %hi(.LCPI119_0) -; RV64V-NEXT: addi a0, a0, %lo(.LCPI119_0) +; RV64V-NEXT: vle16.v v9, (a1) ; RV64V-NEXT: vle16.v v10, (a0) -; RV64V-NEXT: vrgather.vv v8, v9, v10 +; RV64V-NEXT: vrgather.vv v8, v10, v9 ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_shuffle_vrgather: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index d70ed2fb0e2665..4b1f0beb487008 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -228,11 +228,11 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) { define <32 x i8> @reverse_v32i8(<32 x i8> %a) { ; CHECK-LABEL: reverse_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI12_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI12_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI12_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -243,11 +243,11 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) { define <64 x i8> @reverse_v64i8(<64 x i8> %a) { ; CHECK-LABEL: reverse_v64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI13_0) -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: 
vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: lui a1, %hi(.LCPI13_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI13_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -323,11 +323,11 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) { define <32 x i16> @reverse_v32i16(<32 x i16> %a) { ; CHECK-LABEL: reverse_v32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI19_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI19_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vsext.vf2 v16, v12 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 @@ -520,11 +520,11 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) { define <32 x half> @reverse_v32f16(<32 x half> %a) { ; CHECK-LABEL: reverse_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI34_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI34_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vsext.vf2 v16, v12 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 @@ -820,33 +820,33 @@ define <6 x i64> @reverse_v6i64(<6 x i64> %a) { define <12 x i64> @reverse_v12i64(<12 x i64> %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_v12i64: ; RV32-BITS-UNKNOWN: # %bb.0: -; RV32-BITS-UNKNOWN-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-BITS-UNKNOWN-NEXT: li a1, 32 -; 
RV32-BITS-UNKNOWN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vle16.v v24, (a0) +; RV32-BITS-UNKNOWN-NEXT: li a0, 32 +; RV32-BITS-UNKNOWN-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-BITS-UNKNOWN-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vle16.v v24, (a1) ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-BITS-UNKNOWN-NEXT: vmv.v.v v8, v16 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_v12i64: ; RV32-BITS-256: # %bb.0: -; RV32-BITS-256-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-BITS-256-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-BITS-256-NEXT: li a1, 32 -; RV32-BITS-256-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-BITS-256-NEXT: vle16.v v24, (a0) +; RV32-BITS-256-NEXT: li a0, 32 +; RV32-BITS-256-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-BITS-256-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-BITS-256-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-BITS-256-NEXT: vle16.v v24, (a1) ; RV32-BITS-256-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-BITS-256-NEXT: vmv.v.v v8, v16 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_v12i64: ; RV32-BITS-512: # %bb.0: -; RV32-BITS-512-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-BITS-512-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-BITS-512-NEXT: li a1, 32 -; RV32-BITS-512-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-BITS-512-NEXT: vle16.v v24, (a0) +; RV32-BITS-512-NEXT: li a0, 32 +; RV32-BITS-512-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-BITS-512-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-BITS-512-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-BITS-512-NEXT: vle16.v v24, (a1) ; RV32-BITS-512-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-BITS-512-NEXT: vmv.v.v v8, v16 ; RV32-BITS-512-NEXT: ret @@ -883,11 +883,11 @@ define <12 x i64> @reverse_v12i64(<12 x i64> %a) { ; ; RV32-ZVBB-LABEL: reverse_v12i64: ; RV32-ZVBB: # %bb.0: -; RV32-ZVBB-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-ZVBB-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-ZVBB-NEXT: li 
a1, 32 -; RV32-ZVBB-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-ZVBB-NEXT: vle16.v v24, (a0) +; RV32-ZVBB-NEXT: li a0, 32 +; RV32-ZVBB-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-ZVBB-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-ZVBB-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-ZVBB-NEXT: vle16.v v24, (a1) ; RV32-ZVBB-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-ZVBB-NEXT: vmv.v.v v8, v16 ; RV32-ZVBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll index 0161ac4bc338db..e2580c132f65e9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll @@ -225,11 +225,11 @@ declare <16 x i64> @llvm.experimental.stepvector.v16i64() define <16 x i64> @stepvector_v16i64() { ; RV32-LABEL: stepvector_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI16_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI16_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle8.v v16, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: lui a1, %hi(.LCPI16_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_0) +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vle8.v v16, (a1) ; RV32-NEXT: vsext.vf4 v8, v16 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll index 6e327457bebffc..368f454fa5fda1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll @@ -106,11 +106,11 @@ define <16 x i8> @v16i8(<16 x i8> %a) { define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v16i8_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI7_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI7_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI7_0) +; 
CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vmv1r.v v14, v9 ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vid.v v8 @@ -230,11 +230,11 @@ define <16 x i16> @v16i16(<16 x i16> %a) { define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: v16i16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI15_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI15_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI15_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI15_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: vmv2r.v v20, v10 ; CHECK-NEXT: vmv2r.v v12, v8 ; CHECK-NEXT: vrgather.vv v8, v12, v16 @@ -363,11 +363,11 @@ define <16 x i32> @v16i32(<16 x i32> %a) { define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: v16i32_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI23_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle16.v v20, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI23_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI23_0) +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vle16.v v20, (a1) ; CHECK-NEXT: vmv4r.v v24, v12 ; CHECK-NEXT: vmv4r.v v16, v8 ; CHECK-NEXT: vrgatherei16.vv v8, v16, v20 @@ -548,11 +548,11 @@ define <16 x half> @v16f16(<16 x half> %a) { define <32 x half> @v16f16_2(<16 x half> %a) { ; CHECK-LABEL: v16f16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI35_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI35_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI35_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a1) ; 
CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -719,11 +719,11 @@ define <8 x double> @v4f64_2(<4 x double> %a, <4 x double> %b) { define <32 x i8> @v32i8(<32 x i8> %a) { ; CHECK-LABEL: v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI46_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI46_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI46_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll index 87d69bfad38c2b..d3e495bb723ad8 100644 --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -56,12 +56,12 @@ define void @caller_indirect_tail(i32 %a) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: beqz a0, .LBB3_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: lui a0, %hi(callee_indirect2) -; CHECK-NEXT: addi t1, a0, %lo(callee_indirect2) +; CHECK-NEXT: lui t1, %hi(callee_indirect2) +; CHECK-NEXT: addi t1, t1, %lo(callee_indirect2) ; CHECK-NEXT: jr t1 ; CHECK-NEXT: .LBB3_2: -; CHECK-NEXT: lui a0, %hi(callee_indirect1) -; CHECK-NEXT: addi t1, a0, %lo(callee_indirect1) +; CHECK-NEXT: lui t1, %hi(callee_indirect1) +; CHECK-NEXT: addi t1, t1, %lo(callee_indirect1) ; CHECK-NEXT: jr t1 diff --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll index 2fd4572d234567..65307363048376 100644 --- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll +++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll @@ -10,36 +10,30 @@ define signext i32 @unroll_loop_cse() { ; CHECK-LABEL: unroll_loop_cse: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(x) -; CHECK-NEXT: lw a3, %lo(x)(a1) -; CHECK-NEXT: lui a2, %hi(check) -; 
CHECK-NEXT: lw a4, %lo(check)(a2) +; CHECK-NEXT: lui a0, %hi(x) +; CHECK-NEXT: lw a1, %lo(x)(a0) +; CHECK-NEXT: lui a0, %hi(check) +; CHECK-NEXT: lw a2, %lo(check)(a0) ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: bne a3, a4, .LBB0_6 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi a1, a1, %lo(x) -; CHECK-NEXT: lw a1, 4(a1) -; CHECK-NEXT: addi a2, a2, %lo(check) -; CHECK-NEXT: lw a2, 4(a2) ; CHECK-NEXT: bne a1, a2, .LBB0_6 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a1, %hi(x) ; CHECK-NEXT: addi a1, a1, %lo(x) -; CHECK-NEXT: lw a3, 8(a1) +; CHECK-NEXT: lw a3, 4(a1) ; CHECK-NEXT: lui a2, %hi(check) ; CHECK-NEXT: addi a2, a2, %lo(check) +; CHECK-NEXT: lw a4, 4(a2) +; CHECK-NEXT: bne a3, a4, .LBB0_6 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: lw a3, 8(a1) ; CHECK-NEXT: lw a4, 8(a2) ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: lw a1, 12(a1) -; CHECK-NEXT: lw a2, 12(a2) -; CHECK-NEXT: bne a1, a2, .LBB0_6 +; CHECK-NEXT: lw a3, 12(a1) +; CHECK-NEXT: lw a4, 12(a2) +; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.4: -; CHECK-NEXT: lui a1, %hi(x) -; CHECK-NEXT: addi a1, a1, %lo(x) ; CHECK-NEXT: lw a3, 16(a1) -; CHECK-NEXT: lui a2, %hi(check) -; CHECK-NEXT: addi a2, a2, %lo(check) ; CHECK-NEXT: lw a4, 16(a2) ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.5: From 765206e050453018e861637a08a4520f29238074 Mon Sep 17 00:00:00 2001 From: gulfemsavrun Date: Tue, 28 May 2024 15:06:11 -0700 Subject: [PATCH 73/89] [CodeGen] Hidden visibility for prof version var (#93496) This patch adds hidden visibility to the variable that is used by the single byte counters mode in source-based code coverage. 
--- clang/lib/CodeGen/CodeGenPGO.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 76704c4d7be4a4..db8e6f55302adc 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1340,7 +1340,7 @@ void CodeGenPGO::setProfileVersion(llvm::Module &M) { llvm::APInt(64, ProfileVersion)), VarName); - IRLevelVersionVariable->setVisibility(llvm::GlobalValue::DefaultVisibility); + IRLevelVersionVariable->setVisibility(llvm::GlobalValue::HiddenVisibility); llvm::Triple TT(M.getTargetTriple()); if (TT.supportsCOMDAT()) { IRLevelVersionVariable->setLinkage(llvm::GlobalValue::ExternalLinkage); From 067b4ccb4b5ab93ac2dc2243248a8934fa1f7ce3 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 28 May 2024 15:19:04 -0700 Subject: [PATCH 74/89] Upstream libc++ buildbot restarter. (#93582) I've been running a cronjob on my local machine to restart preempted libc++ CI runs. This is bad and brittle. This upstreams a much better version of the restarter. It works by matching on check run annotations looking for mention of the machine being shutdown. If there are both preempted jobs and failing jobs, we don't restart the workflow. Maybe we should change that? --- .../restart-preempted-libcxx-jobs.yaml | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 .github/workflows/restart-preempted-libcxx-jobs.yaml diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml new file mode 100644 index 00000000000000..a71f2084182e5e --- /dev/null +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -0,0 +1,109 @@ +name: Restart Preempted Libc++ Workflow + +# The libc++ builders run on preemptable VMs, which can be shutdown at any time. +# This workflow identifies when a workflow run was canceled due to the VM being preempted, +# and restarts the workflow run. 
+ +# We identify a canceled workflow run by checking the annotations of the check runs in the check suite, +# which should contain the message "The runner has received a shutdown signal." + +# Note: If a job is both preempted and also contains a non-preemption failure, we do not restart the workflow. + +on: + workflow_run: + workflows: + - "Build and Test libc\+\+" + types: + - failure + - canceled + +permissions: + contents: read + +jobs: + restart: + if: github.repository_owner == 'llvm' + name: "Restart Job" + permissions: + statuses: read + checks: read + actions: write + runs-on: ubuntu-latest + steps: + - name: "Restart Job" + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1 + with: + script: | + const failure_regex = /Process completed with exit code 1./ + const preemption_regex = /The runner has received a shutdown signal/ + + console.log('Listing check runs for suite') + const check_suites = await github.rest.checks.listForSuite({ + owner: context.repo.owner, + repo: context.repo.repo, + check_suite_id: context.payload.workflow_run.check_suite_id + }) + + check_run_ids = []; + for (check_run of check_suites.data.check_runs) { + console.log('Checking check run: ' + check_run.id); + console.log(check_run); + if (check_run.status != 'completed') { + console.log('Check run was not completed. Skipping.'); + continue; + } + if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') { + console.log('Check run had conclusion: ' + check_run.conclusion + '. 
Skipping.'); + continue; + } + check_run_ids.push(check_run.id); + } + + has_preempted_job = false; + + for (check_run_id of check_run_ids) { + console.log('Listing annotations for check run: ' + check_run_id); + + annotations = await github.rest.checks.listAnnotations({ + owner: context.repo.owner, + repo: context.repo.repo, + check_run_id: check_run_id + }) + + console.log(annotations); + for (annotation of annotations.data) { + if (annotation.annotation_level != 'failure') { + continue; + } + + const preemption_match = annotation.message.match(preemption_regex); + + if (preemption_match != null) { + console.log('Found preemption message: ' + annotation.message); + has_preempted_job = true; + } + + const failure_match = annotation.message.match(failure_regex); + if (failure_match != null) { + // We only want to restart the workflow if all of the failures were due to preemption. + // We don't want to restart the workflow if there were other failures. + console.log('Choosing not to rerun workflow because we found a non-preemption failure'); + console.log('Failure message: ' + annotation.message); + return; + } + } + } + + if (!has_preempted_job) { + console.log('No preempted jobs found. 
Not restarting workflow.'); + return; + } + + console.log("Restarted workflow: " + context.payload.workflow_run.id); + await github.rest.actions.reRunWorkflowFailedJobs({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: context.payload.workflow_run.id + }) + + From b9cdea66b62e2eb91814ef7c57ea01aa27440e72 Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Tue, 28 May 2024 18:23:14 -0400 Subject: [PATCH 75/89] Attempt to fix issue with plus sign in libc++ workflow name --- .github/workflows/restart-preempted-libcxx-jobs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml index a71f2084182e5e..5682b0a4f52c3d 100644 --- a/.github/workflows/restart-preempted-libcxx-jobs.yaml +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -12,7 +12,7 @@ name: Restart Preempted Libc++ Workflow on: workflow_run: workflows: - - "Build and Test libc\+\+" + - Build and Test libc\+\+ types: - failure - canceled From 6aeea700df6f3f8db9e6a79be4aa593c6fcc7d18 Mon Sep 17 00:00:00 2001 From: Spenser Bauman Date: Tue, 28 May 2024 18:29:17 -0400 Subject: [PATCH 76/89] [mlir][dataflow] Fix for integer range analysis propagation bug (#93199) Integer range analysis will not update the range of an operation when any of the inferred input lattices are uninitialized. In the current behavior, all lattice values for non integer types are uninitialized. For operations like arith.cmpf ```mlir %3 = arith.cmpf ugt, %arg0, %arg1 : f32 ``` that will result in the range of the output also being uninitialized, and so on for any consumer of the arith.cmpf result. When control-flow ops are involved, the lack of propagation results in incorrect ranges, as the back edges for loop carried values are not properly joined with the definitions from the body region. 
For example, an scf.while loop whose body region produces a value that is in a dataflow relationship with some floating-point values through an arith.cmpf operation: ```mlir func.func @test_bad_range(%arg0: f32, %arg1: f32) -> (index, index) { %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %3 = arith.cmpf ugt, %arg0, %arg1 : f32 %1:2 = scf.while (%arg2 = %c0, %arg3 = %c0) : (index, index) -> (index, index) { %2 = arith.cmpi ult, %arg2, %c4 : index scf.condition(%2) %arg2, %arg3 : index, index } do { ^bb0(%arg2: index, %arg3: index): %4 = arith.select %3, %arg3, %arg3 : index %5 = arith.addi %arg2, %c1 : index scf.yield %5, %4 : index, index } return %1#0, %1#1 : index, index } ``` The existing behavior results in the control condition %2 being optimized to true, turning the while loop into an infinite loop. The update to %arg2 through the body region is never factored into the range calculation, as the ranges for the body ops all test as uninitialized. This change causes all values initialized with setToEntryState to be set to some initialized range, even if the values are not integers. 
--------- Co-authored-by: Spenser Bauman --- .../Analysis/DataFlow/IntegerRangeAnalysis.h | 45 ----------- .../include/mlir/Dialect/Arith/IR/ArithOps.td | 16 ++-- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 12 +-- .../include/mlir/Dialect/Index/IR/IndexOps.td | 2 +- .../mlir/Interfaces/InferIntRangeInterface.h | 75 ++++++++++++++++++- .../mlir/Interfaces/InferIntRangeInterface.td | 46 +++++++++--- .../Interfaces/Utils/InferIntRangeCommon.h | 8 +- .../DataFlow/IntegerRangeAnalysis.cpp | 51 ++++--------- .../Arith/IR/InferIntRangeInterfaceImpls.cpp | 18 +++-- .../lib/Interfaces/InferIntRangeInterface.cpp | 48 ++++++++++++ .../Interfaces/Utils/InferIntRangeCommon.cpp | 2 +- .../Dialect/Arith/int-range-interface.mlir | 19 +++++ mlir/test/lib/Dialect/Test/TestOps.td | 9 ++- 13 files changed, 230 insertions(+), 121 deletions(-) diff --git a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h index 8bd7cf880c6afb..191c023fb642cb 100644 --- a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h @@ -24,51 +24,6 @@ namespace mlir { namespace dataflow { -/// This lattice value represents the integer range of an SSA value. -class IntegerValueRange { -public: - /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)]) - /// range that is used to mark the value as unable to be analyzed further, - /// where `t` is the type of `value`. - static IntegerValueRange getMaxRange(Value value); - - /// Create an integer value range lattice value. - IntegerValueRange(std::optional value = std::nullopt) - : value(std::move(value)) {} - - /// Whether the range is uninitialized. This happens when the state hasn't - /// been set during the analysis. - bool isUninitialized() const { return !value.has_value(); } - - /// Get the known integer value range. 
- const ConstantIntRanges &getValue() const { - assert(!isUninitialized()); - return *value; - } - - /// Compare two ranges. - bool operator==(const IntegerValueRange &rhs) const { - return value == rhs.value; - } - - /// Take the union of two ranges. - static IntegerValueRange join(const IntegerValueRange &lhs, - const IntegerValueRange &rhs) { - if (lhs.isUninitialized()) - return rhs; - if (rhs.isUninitialized()) - return lhs; - return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())}; - } - - /// Print the integer value range. - void print(raw_ostream &os) const { os << value; } - -private: - /// The known integer value range. - std::optional value; -}; - /// This lattice element represents the integer value range of an SSA value. /// When this lattice is updated, it automatically updates the constant value /// of the SSA value (if the range can be narrowed to one). diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td index ead52332e8eec3..46248dad3be9e0 100644 --- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td +++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td @@ -49,7 +49,7 @@ class Arith_BinaryOp traits = []> : // Base class for integer binary operations. class Arith_IntBinaryOp traits = []> : Arith_BinaryOp]>, + [DeclareOpInterfaceMethods]>, Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs)>, Results<(outs SignlessIntegerLike:$result)>; @@ -107,7 +107,7 @@ class Arith_IToICastOp traits = []> : Arith_CastOp]>; + [DeclareOpInterfaceMethods]>; // Cast from an integer type to a floating point type. 
class Arith_IToFCastOp traits = []> : Arith_CastOp; @@ -139,7 +139,7 @@ class Arith_CompareOpOfAnyRank traits = []> : class Arith_IntBinaryOpWithOverflowFlags traits = []> : Arith_BinaryOp, + [Pure, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>, Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs, DefaultValuedAttr< @@ -159,7 +159,7 @@ def Arith_ConstantOp : Op, AllTypesMatch<["value", "result"]>, - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods]> { let summary = "integer or floating point constant"; let description = [{ The `constant` operation produces an SSA value equal to some integer or @@ -1327,7 +1327,7 @@ def IndexCastTypeConstraint : TypeConstraint]> { + [DeclareOpInterfaceMethods]> { let summary = "cast between index and integer types"; let description = [{ Casts between scalar or vector integers and corresponding 'index' scalar or @@ -1346,7 +1346,7 @@ def Arith_IndexCastOp def Arith_IndexCastUIOp : Arith_CastOp<"index_castui", IndexCastTypeConstraint, IndexCastTypeConstraint, - [DeclareOpInterfaceMethods]> { + [DeclareOpInterfaceMethods]> { let summary = "unsigned cast between index and integer types"; let description = [{ Casts between scalar or vector integers and corresponding 'index' scalar or @@ -1400,7 +1400,7 @@ def Arith_BitcastOp : Arith_CastOp<"bitcast", BitcastTypeConstraint, def Arith_CmpIOp : Arith_CompareOpOfAnyRank<"cmpi", - [DeclareOpInterfaceMethods]> { + [DeclareOpInterfaceMethods]> { let summary = "integer comparison operation"; let description = [{ The `cmpi` operation is a generic comparison for integer-like types. 
Its two @@ -1555,7 +1555,7 @@ class ScalarConditionOrMatchingShape names> : def SelectOp : Arith_Op<"select", [Pure, AllTypesMatch<["true_value", "false_value", "result"]>, ScalarConditionOrMatchingShape<["condition", "result"]>, - DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, ] # ElementwiseMappable.traits> { let summary = "select operation"; let description = [{ diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 1da68ed2176d8f..10719aae5c8b46 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -52,7 +52,7 @@ def GPU_DimensionAttr : EnumAttr; class GPU_IndexOp traits = []> : GPU_Op, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods])>, Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> { let assemblyFormat = "$dimension attr-dict"; @@ -144,7 +144,7 @@ def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> { } def GPU_LaneIdOp : GPU_Op<"lane_id", [ - Pure, DeclareOpInterfaceMethods]> { + Pure, DeclareOpInterfaceMethods]> { let description = [{ Returns the lane id within the subgroup (warp/wave). @@ -158,7 +158,7 @@ def GPU_LaneIdOp : GPU_Op<"lane_id", [ } def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [ - Pure, DeclareOpInterfaceMethods]>, + Pure, DeclareOpInterfaceMethods]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the subgroup id, i.e., the index of the current subgroup within the @@ -190,7 +190,7 @@ def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> { def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [ - Pure, DeclareOpInterfaceMethods]>, + Pure, DeclareOpInterfaceMethods]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the number of subgroups within a workgroup. 
@@ -206,7 +206,7 @@ def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [ } def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [ - Pure, DeclareOpInterfaceMethods]>, + Pure, DeclareOpInterfaceMethods]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the number of threads within a subgroup. @@ -687,7 +687,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [ def GPU_LaunchOp : GPU_Op<"launch", [ AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface, - DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, RecursiveMemoryEffects]>, Arguments<(ins Variadic:$asyncDependencies, Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, diff --git a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td index c6079cb8a98c81..a30ae9f739cbc6 100644 --- a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td +++ b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td @@ -25,7 +25,7 @@ include "mlir/IR/OpBase.td" /// Base class for Index dialect operations. class IndexOp traits = []> : Op] # traits>; + [DeclareOpInterfaceMethods] # traits>; //===----------------------------------------------------------------------===// // IndexBinaryOp diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h index 05064a72ef02e7..0e107e88f5232f 100644 --- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h +++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h @@ -105,10 +105,83 @@ class ConstantIntRanges { raw_ostream &operator<<(raw_ostream &, const ConstantIntRanges &); +/// This lattice value represents the integer range of an SSA value. +class IntegerValueRange { +public: + /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)]) + /// range that is used to mark the value as unable to be analyzed further, + /// where `t` is the type of `value`. 
+ static IntegerValueRange getMaxRange(Value value); + + /// Create an integer value range lattice value. + IntegerValueRange(ConstantIntRanges value) : value(std::move(value)) {} + + /// Create an integer value range lattice value. + IntegerValueRange(std::optional value = std::nullopt) + : value(std::move(value)) {} + + /// Whether the range is uninitialized. This happens when the state hasn't + /// been set during the analysis. + bool isUninitialized() const { return !value.has_value(); } + + /// Get the known integer value range. + const ConstantIntRanges &getValue() const { + assert(!isUninitialized()); + return *value; + } + + /// Compare two ranges. + bool operator==(const IntegerValueRange &rhs) const { + return value == rhs.value; + } + + /// Compute the least upper bound of two ranges. + static IntegerValueRange join(const IntegerValueRange &lhs, + const IntegerValueRange &rhs) { + if (lhs.isUninitialized()) + return rhs; + if (rhs.isUninitialized()) + return lhs; + return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())}; + } + + /// Print the integer value range. + void print(raw_ostream &os) const { os << value; } + +private: + /// The known integer value range. + std::optional value; +}; + +raw_ostream &operator<<(raw_ostream &, const IntegerValueRange &); + /// The type of the `setResultRanges` callback provided to ops implementing /// InferIntRangeInterface. It should be called once for each integer result /// value and be passed the ConstantIntRanges corresponding to that value. -using SetIntRangeFn = function_ref; +using SetIntRangeFn = + llvm::function_ref; + +/// Similar to SetIntRangeFn, but operating on IntegerValueRange lattice values. +/// This is the `setResultRanges` callback for the IntegerValueRange based +/// interface method. 
+using SetIntLatticeFn = + llvm::function_ref; + +class InferIntRangeInterface; + +namespace intrange::detail { +/// Default implementation of `inferResultRanges` which dispatches to the +/// `inferResultRangesFromOptional`. +void defaultInferResultRanges(InferIntRangeInterface interface, + ArrayRef argRanges, + SetIntLatticeFn setResultRanges); + +/// Default implementation of `inferResultRangesFromOptional` which dispatches +/// to the `inferResultRanges`. +void defaultInferResultRangesFromOptional(InferIntRangeInterface interface, + ArrayRef argRanges, + SetIntRangeFn setResultRanges); +} // end namespace intrange::detail } // end namespace mlir #include "mlir/Interfaces/InferIntRangeInterface.h.inc" diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td index dbdc526c6f10b6..6ee436ce4d6c2f 100644 --- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td +++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td @@ -28,9 +28,10 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> { Infer the bounds on the results of this op given the bounds on its arguments. For each result value or block argument (that isn't a branch argument, since the dataflow analysis handles those case), the method should call - `setValueRange` with that `Value` as an argument. When `setValueRange` - is not called for some value, it will recieve a default value of the mimimum - and maximum values for its type (the unbounded range). + `setValueRange` with that `Value` as an argument. When implemented, + `setValueRange` should be called on all result values for the operation. + When operations take non-integer inputs, the + `inferResultRangesFromOptional` method should be implemented instead. 
When called on an op that also implements the RegionBranchOpInterface or BranchOpInterface, this method should not attempt to infer the values @@ -39,14 +40,39 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> { This function will only be called when at least one result of the op is a scalar integer value or the op has a region. + }], + /*retTy=*/"void", + /*methodName=*/"inferResultRanges", + /*args=*/(ins "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges, + "::mlir::SetIntRangeFn":$setResultRanges), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + ::mlir::intrange::detail::defaultInferResultRangesFromOptional($_op, + argRanges, + setResultRanges); + }]>, + + InterfaceMethod<[{ + Infer the bounds on the results of this op given the lattice representation + of the bounds for its arguments. For each result value or block argument + (that isn't a branch argument, since the dataflow analysis handles + those case), the method should call `setValueRange` with that `Value` + as an argument. When implemented, `setValueRange` should be called on + all result values for the operation. - `argRanges` contains one `IntRangeAttrs` for each argument to the op in ODS - order. Non-integer arguments will have the an unbounded range of width-0 - APInts in their `argRanges` element. + This method allows for more precise implementations when operations + want to reason about inputs which may be undefined during the analysis. 
}], - "void", "inferResultRanges", (ins - "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges, - "::mlir::SetIntRangeFn":$setResultRanges) - >]; + /*retTy=*/"void", + /*methodName=*/"inferResultRangesFromOptional", + /*args=*/(ins "::llvm::ArrayRef<::mlir::IntegerValueRange>":$argRanges, + "::mlir::SetIntLatticeFn":$setResultRanges), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + ::mlir::intrange::detail::defaultInferResultRanges($_op, + argRanges, + setResultRanges); + }]> + ]; } #endif // MLIR_INTERFACES_INFERINTRANGEINTERFACE diff --git a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h index 851bb534bc7ee1..3988a8826498a9 100644 --- a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h +++ b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h @@ -25,7 +25,11 @@ namespace intrange { /// abstracted away here to permit writing the function that handles both /// 64- and 32-bit index types. using InferRangeFn = - function_ref)>; + std::function)>; + +/// Function that performs inferrence on an array of `IntegerValueRange`. +using InferIntegerValueRangeFn = + std::function)>; static constexpr unsigned indexMinWidth = 32; static constexpr unsigned indexMaxWidth = 64; @@ -52,7 +56,7 @@ using InferRangeWithOvfFlagsFn = /// /// The `mode` argument specifies if the unsigned, signed, or both results of /// the inference computation should be used when comparing the results. 
-ConstantIntRanges inferIndexOp(InferRangeFn inferFn, +ConstantIntRanges inferIndexOp(const InferRangeFn &inferFn, ArrayRef argRanges, CmpMode mode); diff --git a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp index a82c30717e275b..9721620807a0f0 100644 --- a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp @@ -36,17 +36,6 @@ using namespace mlir; using namespace mlir::dataflow; -IntegerValueRange IntegerValueRange::getMaxRange(Value value) { - unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType()); - if (width == 0) - return {}; - APInt umin = APInt::getMinValue(width); - APInt umax = APInt::getMaxValue(width); - APInt smin = width != 0 ? APInt::getSignedMinValue(width) : umin; - APInt smax = width != 0 ? APInt::getSignedMaxValue(width) : umax; - return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}}; -} - void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const { Lattice::onUpdate(solver); @@ -72,24 +61,17 @@ void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const { void IntegerRangeAnalysis::visitOperation( Operation *op, ArrayRef operands, ArrayRef results) { - // If the lattice on any operand is unitialized, bail out. 
- if (llvm::any_of(operands, [](const IntegerValueRangeLattice *lattice) { - return lattice->getValue().isUninitialized(); - })) { - return; - } - auto inferrable = dyn_cast(op); if (!inferrable) return setAllToEntryStates(results); LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n"); - SmallVector argRanges( - llvm::map_range(operands, [](const IntegerValueRangeLattice *val) { - return val->getValue().getValue(); - })); + auto argRanges = llvm::map_to_vector( + operands, [](const IntegerValueRangeLattice *lattice) { + return lattice->getValue(); + }); - auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) { + auto joinCallback = [&](Value v, const IntegerValueRange &attrs) { auto result = dyn_cast(v); if (!result) return; @@ -99,7 +81,7 @@ void IntegerRangeAnalysis::visitOperation( IntegerValueRangeLattice *lattice = results[result.getResultNumber()]; IntegerValueRange oldRange = lattice->getValue(); - ChangeResult changed = lattice->join(IntegerValueRange{attrs}); + ChangeResult changed = lattice->join(attrs); // Catch loop results with loop variant bounds and conservatively make // them [-inf, inf] so we don't circle around infinitely often (because @@ -116,7 +98,7 @@ void IntegerRangeAnalysis::visitOperation( propagateIfChanged(lattice, changed); }; - inferrable.inferResultRanges(argRanges, joinCallback); + inferrable.inferResultRangesFromOptional(argRanges, joinCallback); } void IntegerRangeAnalysis::visitNonControlFlowArguments( @@ -124,17 +106,12 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( ArrayRef argLattices, unsigned firstIndex) { if (auto inferrable = dyn_cast(op)) { LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n"); - // If the lattice on any operand is unitialized, bail out. 
- if (llvm::any_of(op->getOperands(), [&](Value value) { - return getLatticeElementFor(op, value)->getValue().isUninitialized(); - })) - return; - SmallVector argRanges( - llvm::map_range(op->getOperands(), [&](Value value) { - return getLatticeElementFor(op, value)->getValue().getValue(); - })); - auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) { + auto argRanges = llvm::map_to_vector(op->getOperands(), [&](Value value) { + return getLatticeElementFor(op, value)->getValue(); + }); + + auto joinCallback = [&](Value v, const IntegerValueRange &attrs) { auto arg = dyn_cast(v); if (!arg) return; @@ -145,7 +122,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( IntegerValueRangeLattice *lattice = argLattices[arg.getArgNumber()]; IntegerValueRange oldRange = lattice->getValue(); - ChangeResult changed = lattice->join(IntegerValueRange{attrs}); + ChangeResult changed = lattice->join(attrs); // Catch loop results with loop variant bounds and conservatively make // them [-inf, inf] so we don't circle around infinitely often (because @@ -162,7 +139,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( propagateIfChanged(lattice, changed); }; - inferrable.inferResultRanges(argRanges, joinCallback); + inferrable.inferResultRangesFromOptional(argRanges, joinCallback); return; } diff --git a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp index fbe2ecab8adcaa..462044417b5fb8 100644 --- a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp +++ b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp @@ -295,18 +295,24 @@ void arith::CmpIOp::inferResultRanges(ArrayRef argRanges, // SelectOp //===----------------------------------------------------------------------===// -void arith::SelectOp::inferResultRanges(ArrayRef argRanges, - SetIntRangeFn setResultRange) { - std::optional mbCondVal = argRanges[0].getConstantValue(); +void 
arith::SelectOp::inferResultRangesFromOptional( + ArrayRef argRanges, SetIntLatticeFn setResultRange) { + std::optional mbCondVal = + argRanges[0].isUninitialized() + ? std::nullopt + : argRanges[0].getValue().getConstantValue(); + + const IntegerValueRange &trueCase = argRanges[1]; + const IntegerValueRange &falseCase = argRanges[2]; if (mbCondVal) { if (mbCondVal->isZero()) - setResultRange(getResult(), argRanges[2]); + setResultRange(getResult(), falseCase); else - setResultRange(getResult(), argRanges[1]); + setResultRange(getResult(), trueCase); return; } - setResultRange(getResult(), argRanges[1].rangeUnion(argRanges[2])); + setResultRange(getResult(), IntegerValueRange::join(trueCase, falseCase)); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Interfaces/InferIntRangeInterface.cpp b/mlir/lib/Interfaces/InferIntRangeInterface.cpp index b3f6c0ee3cc32d..d879b93586899b 100644 --- a/mlir/lib/Interfaces/InferIntRangeInterface.cpp +++ b/mlir/lib/Interfaces/InferIntRangeInterface.cpp @@ -126,3 +126,51 @@ raw_ostream &mlir::operator<<(raw_ostream &os, const ConstantIntRanges &range) { return os << "unsigned : [" << range.umin() << ", " << range.umax() << "] signed : [" << range.smin() << ", " << range.smax() << "]"; } + +IntegerValueRange IntegerValueRange::getMaxRange(Value value) { + unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType()); + if (width == 0) + return {}; + + APInt umin = APInt::getMinValue(width); + APInt umax = APInt::getMaxValue(width); + APInt smin = width != 0 ? APInt::getSignedMinValue(width) : umin; + APInt smax = width != 0 ? 
APInt::getSignedMaxValue(width) : umax; + return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}}; +} + +raw_ostream &mlir::operator<<(raw_ostream &os, const IntegerValueRange &range) { + range.print(os); + return os; +} + +void mlir::intrange::detail::defaultInferResultRanges( + InferIntRangeInterface interface, ArrayRef argRanges, + SetIntLatticeFn setResultRanges) { + llvm::SmallVector unpacked; + unpacked.reserve(argRanges.size()); + + for (const IntegerValueRange &range : argRanges) { + if (range.isUninitialized()) + return; + unpacked.push_back(range.getValue()); + } + + interface.inferResultRanges( + unpacked, + [&setResultRanges](Value value, const ConstantIntRanges &argRanges) { + setResultRanges(value, IntegerValueRange{argRanges}); + }); +} + +void mlir::intrange::detail::defaultInferResultRangesFromOptional( + InferIntRangeInterface interface, ArrayRef argRanges, + SetIntRangeFn setResultRanges) { + auto ranges = llvm::to_vector_of(argRanges); + interface.inferResultRangesFromOptional( + ranges, + [&setResultRanges](Value value, const IntegerValueRange &argRanges) { + if (!argRanges.isUninitialized()) + setResultRanges(value, argRanges.getValue()); + }); +} diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp index fe1a67d6287386..5b8d35e7bd5197 100644 --- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp +++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp @@ -76,7 +76,7 @@ static ConstantIntRanges minMaxBy(ConstArithFn op, ArrayRef lhs, //===----------------------------------------------------------------------===// ConstantIntRanges -mlir::intrange::inferIndexOp(InferRangeFn inferFn, +mlir::intrange::inferIndexOp(const InferRangeFn &inferFn, ArrayRef argRanges, intrange::CmpMode mode) { ConstantIntRanges sixtyFour = inferFn(argRanges); diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir index 
5b538197a0c117..60f0ab41afa48d 100644 --- a/mlir/test/Dialect/Arith/int-range-interface.mlir +++ b/mlir/test/Dialect/Arith/int-range-interface.mlir @@ -899,3 +899,22 @@ func.func @test_shl_i8_nowrap() -> i8 { %2 = test.reflect_bounds %1 : i8 return %2: i8 } + +/// A test case to ensure that the ranges for unsupported ops are initialized +/// properly to maxRange, rather than left uninitialized. +/// In this test case, the previous behavior would leave the ranges for %a and +/// %b uninitialized, resulting in arith.cmpf's range not being updated, even +/// though it has an integer valued result. + +// CHECK-LABEL: func @test_cmpf_propagates +// CHECK: test.reflect_bounds {smax = 2 : index, smin = 1 : index, umax = 2 : index, umin = 1 : index} +func.func @test_cmpf_propagates(%a: f32, %b: f32) -> index { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + %0 = arith.cmpf ueq, %a, %b : f32 + %1 = arith.select %0, %c1, %c2 : index + %2 = test.reflect_bounds %1 : index + func.return %2 : index +} + diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 18324482153a54..9d7e0a7928ab8d 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -2750,7 +2750,7 @@ def TestGraphLoopOp : TEST_Op<"graph_loop", def InferIntRangeType : AnyTypeOf<[AnyInteger, Index]>; def TestWithBoundsOp : TEST_Op<"with_bounds", - [DeclareOpInterfaceMethods, + [DeclareOpInterfaceMethods, NoMemoryEffect]> { let arguments = (ins APIntAttr:$umin, APIntAttr:$umax, @@ -2762,7 +2762,7 @@ def TestWithBoundsOp : TEST_Op<"with_bounds", } def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region", - [DeclareOpInterfaceMethods, + [DeclareOpInterfaceMethods, SingleBlock, NoTerminator]> { let arguments = (ins APIntAttr:$umin, APIntAttr:$umax, @@ -2774,7 +2774,7 @@ def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region", } def TestIncrementOp : TEST_Op<"increment", - [DeclareOpInterfaceMethods, + 
[DeclareOpInterfaceMethods, NoMemoryEffect, AllTypesMatch<["value", "result"]>]> { let arguments = (ins InferIntRangeType:$value); let results = (outs InferIntRangeType:$result); @@ -2783,7 +2783,8 @@ def TestIncrementOp : TEST_Op<"increment", } def TestReflectBoundsOp : TEST_Op<"reflect_bounds", - [DeclareOpInterfaceMethods, AllTypesMatch<["value", "result"]>]> { + [DeclareOpInterfaceMethods, + AllTypesMatch<["value", "result"]>]> { let arguments = (ins InferIntRangeType:$value, OptionalAttr:$umin, OptionalAttr:$umax, From 20d497c26fc95c80a1bacb38820d92e5f52bec58 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 28 May 2024 15:33:59 -0700 Subject: [PATCH 77/89] [Driver] Remove unneeded *-linux-gnu after D158183 Recommit 435ea21c897f94b5a3777a9f152e4c5bb4a371a3. As the comment added by a07727199db0525e9d2df41e466a2a1611b3c8e1 suggests, these `*Triples` lists should shrink over time. https://reviews.llvm.org/D158183 allows *-unknown-linux-gnu to detect *-linux-gnu. If we additionally allow x86_64-unknown-linux-gnu -m32/-mx32 to detect x86_64-linux-gnu, we can mostly remove these *-linux-gnu elements. Retain x86_64-linux-gnu for now to work around #93609. (In addition, Debian /usr/bin/clang --version uses x86_64-pc-linux-gnu). Retain i586-linux-gnu for now to work around #93502. --- clang/lib/Driver/ToolChains/Gnu.cpp | 69 ++++++++++++++--------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 9849c59685cca7..b141e5f2adfab1 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -2227,10 +2227,19 @@ void Generic_GCC::GCCInstallationDetector::init( SmallVector CandidateBiarchTripleAliases; // Add some triples that we want to check first. 
CandidateTripleAliases.push_back(TargetTriple.str()); - std::string TripleNoVendor = TargetTriple.getArchName().str() + "-" + - TargetTriple.getOSAndEnvironmentName().str(); - if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor) + std::string TripleNoVendor, BiarchTripleNoVendor; + if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor) { + StringRef OSEnv = TargetTriple.getOSAndEnvironmentName(); + if (TargetTriple.getEnvironment() == llvm::Triple::GNUX32) + OSEnv = "linux-gnu"; + TripleNoVendor = (TargetTriple.getArchName().str() + '-' + OSEnv).str(); CandidateTripleAliases.push_back(TripleNoVendor); + if (BiarchVariantTriple.getArch() != llvm::Triple::UnknownArch) { + BiarchTripleNoVendor = + (BiarchVariantTriple.getArchName().str() + '-' + OSEnv).str(); + CandidateBiarchTripleAliases.push_back(BiarchTripleNoVendor); + } + } CollectLibDirsAndTriples(TargetTriple, BiarchVariantTriple, CandidateLibDirs, CandidateTripleAliases, CandidateBiarchLibDirs, @@ -2453,11 +2462,9 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( // lists should shrink over time. Please don't add more elements to *Triples. 
static const char *const AArch64LibDirs[] = {"/lib64", "/lib"}; static const char *const AArch64Triples[] = { - "aarch64-none-linux-gnu", "aarch64-linux-gnu", "aarch64-redhat-linux", - "aarch64-suse-linux"}; + "aarch64-none-linux-gnu", "aarch64-redhat-linux", "aarch64-suse-linux"}; static const char *const AArch64beLibDirs[] = {"/lib"}; - static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu", - "aarch64_be-linux-gnu"}; + static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu"}; static const char *const ARMLibDirs[] = {"/lib"}; static const char *const ARMTriples[] = {"arm-linux-gnueabi"}; @@ -2482,9 +2489,8 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( "x86_64-linux-gnu", "x86_64-unknown-linux-gnu", "x86_64-pc-linux-gnu", "x86_64-redhat-linux6E", "x86_64-redhat-linux", "x86_64-suse-linux", - "x86_64-manbo-linux-gnu", "x86_64-linux-gnu", - "x86_64-slackware-linux", "x86_64-unknown-linux", - "x86_64-amazon-linux"}; + "x86_64-manbo-linux-gnu", "x86_64-slackware-linux", + "x86_64-unknown-linux", "x86_64-amazon-linux"}; static const char *const X32Triples[] = {"x86_64-linux-gnux32", "x86_64-pc-linux-gnux32"}; static const char *const X32LibDirs[] = {"/libx32", "/lib"}; @@ -2500,26 +2506,24 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( "loongarch64-linux-gnu", "loongarch64-unknown-linux-gnu"}; static const char *const M68kLibDirs[] = {"/lib"}; - static const char *const M68kTriples[] = { - "m68k-linux-gnu", "m68k-unknown-linux-gnu", "m68k-suse-linux"}; + static const char *const M68kTriples[] = {"m68k-unknown-linux-gnu", + "m68k-suse-linux"}; static const char *const MIPSLibDirs[] = {"/libo32", "/lib"}; static const char *const MIPSTriples[] = { "mips-linux-gnu", "mips-mti-linux", "mips-mti-linux-gnu", "mips-img-linux-gnu", "mipsisa32r6-linux-gnu"}; static const char *const MIPSELLibDirs[] = {"/libo32", "/lib"}; - static const char *const MIPSELTriples[] = { - "mipsel-linux-gnu", 
"mips-img-linux-gnu", "mipsisa32r6el-linux-gnu"}; + static const char *const MIPSELTriples[] = {"mipsel-linux-gnu", + "mips-img-linux-gnu"}; static const char *const MIPS64LibDirs[] = {"/lib64", "/lib"}; static const char *const MIPS64Triples[] = { - "mips64-linux-gnu", "mips-mti-linux-gnu", - "mips-img-linux-gnu", "mips64-linux-gnuabi64", + "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64-linux-gnuabi64", "mipsisa64r6-linux-gnu", "mipsisa64r6-linux-gnuabi64"}; static const char *const MIPS64ELLibDirs[] = {"/lib64", "/lib"}; static const char *const MIPS64ELTriples[] = { - "mips64el-linux-gnu", "mips-mti-linux-gnu", - "mips-img-linux-gnu", "mips64el-linux-gnuabi64", + "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64el-linux-gnuabi64", "mipsisa64r6el-linux-gnu", "mipsisa64r6el-linux-gnuabi64"}; static const char *const MIPSN32LibDirs[] = {"/lib32"}; @@ -2534,46 +2538,39 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( static const char *const PPCLibDirs[] = {"/lib32", "/lib"}; static const char *const PPCTriples[] = { - "powerpc-linux-gnu", "powerpc-unknown-linux-gnu", "powerpc-linux-gnuspe", + "powerpc-unknown-linux-gnu", // On 32-bit PowerPC systems running SUSE Linux, gcc is configured as a // 64-bit compiler which defaults to "-m32", hence "powerpc64-suse-linux". 
"powerpc64-suse-linux", "powerpc-montavista-linuxspe"}; static const char *const PPCLELibDirs[] = {"/lib32", "/lib"}; - static const char *const PPCLETriples[] = {"powerpcle-linux-gnu", - "powerpcle-unknown-linux-gnu", + static const char *const PPCLETriples[] = {"powerpcle-unknown-linux-gnu", "powerpcle-linux-musl"}; static const char *const PPC64LibDirs[] = {"/lib64", "/lib"}; - static const char *const PPC64Triples[] = { - "powerpc64-linux-gnu", "powerpc64-unknown-linux-gnu", - "powerpc64-suse-linux", "ppc64-redhat-linux"}; + static const char *const PPC64Triples[] = {"powerpc64-unknown-linux-gnu", + "powerpc64-suse-linux", + "ppc64-redhat-linux"}; static const char *const PPC64LELibDirs[] = {"/lib64", "/lib"}; static const char *const PPC64LETriples[] = { - "powerpc64le-linux-gnu", "powerpc64le-unknown-linux-gnu", - "powerpc64le-none-linux-gnu", "powerpc64le-suse-linux", - "ppc64le-redhat-linux"}; + "powerpc64le-unknown-linux-gnu", "powerpc64le-none-linux-gnu", + "powerpc64le-suse-linux", "ppc64le-redhat-linux"}; static const char *const RISCV32LibDirs[] = {"/lib32", "/lib"}; static const char *const RISCV32Triples[] = {"riscv32-unknown-linux-gnu", - "riscv32-linux-gnu", "riscv32-unknown-elf"}; static const char *const RISCV64LibDirs[] = {"/lib64", "/lib"}; static const char *const RISCV64Triples[] = {"riscv64-unknown-linux-gnu", - "riscv64-linux-gnu", "riscv64-unknown-elf"}; static const char *const SPARCv8LibDirs[] = {"/lib32", "/lib"}; - static const char *const SPARCv8Triples[] = {"sparc-linux-gnu", - "sparcv8-linux-gnu"}; + static const char *const SPARCv8Triples[] = {"sparcv8-linux-gnu"}; static const char *const SPARCv9LibDirs[] = {"/lib64", "/lib"}; - static const char *const SPARCv9Triples[] = {"sparc64-linux-gnu", - "sparcv9-linux-gnu"}; + static const char *const SPARCv9Triples[] = {"sparcv9-linux-gnu"}; static const char *const SystemZLibDirs[] = {"/lib64", "/lib"}; static const char *const SystemZTriples[] = { - "s390x-linux-gnu", 
"s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu", - "s390x-suse-linux", "s390x-redhat-linux"}; - + "s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu", "s390x-suse-linux", + "s390x-redhat-linux"}; using std::begin; using std::end; From 760c2aa55f0c5f56bed944328b23aa3f2f764346 Mon Sep 17 00:00:00 2001 From: PiJoules <6019989+PiJoules@users.noreply.github.com> Date: Tue, 28 May 2024 15:37:03 -0700 Subject: [PATCH 78/89] [lld] Support thumb PLTs (#86223) We are using PLTs for cortex-m33 which only supports thumb. More specifically, this is for a very restricted use case. There's no MMU so there's no sharing of virtual addresses between two processes, but this is fine. The MCU is used for running [chre nanoapps](https://android.googlesource.com/platform/system/chre/+/HEAD/doc/nanoapp_overview.md) for android. Each nanoapp is a shared library (but effectively acts as an executable containing a test suite) that is loaded and run on the MCU one binary at a time and there's only one process running at a time, so we ensure that the same text segment cannot be shared by two different running executables. GNU LD supports thumb PLTs but we want to migrate to a clang toolchain and use LLD, so thumb PLTs are needed. --- lld/ELF/Arch/ARM.cpp | 176 +++++++++++++++++++-------- lld/ELF/Config.h | 1 + lld/ELF/InputFiles.cpp | 12 ++ lld/test/ELF/armv8-thumb-plt-reloc.s | 126 +++++++++++++++++++ 4 files changed, 262 insertions(+), 53 deletions(-) create mode 100644 lld/test/ELF/armv8-thumb-plt-reloc.s diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 687f9499009d5e..3e0efe540e1bf1 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -231,36 +231,71 @@ static void writePltHeaderLong(uint8_t *buf) { // The default PLT header requires the .got.plt to be within 128 Mb of the // .plt in the positive direction. 
void ARM::writePltHeader(uint8_t *buf) const { - // Use a similar sequence to that in writePlt(), the difference is the calling - // conventions mean we use lr instead of ip. The PLT entry is responsible for - // saving lr on the stack, the dynamic loader is responsible for reloading - // it. - const uint32_t pltData[] = { - 0xe52de004, // L1: str lr, [sp,#-4]! - 0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4) - 0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4) - 0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4) - }; - - uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4; - if (!llvm::isUInt<27>(offset)) { - // We cannot encode the Offset, use the long form. - writePltHeaderLong(buf); - return; + if (config->armThumbPLTs) { + // The instruction sequence for thumb: + // + // 0: b500 push {lr} + // 2: f8df e008 ldr.w lr, [pc, #0x8] @ 0xe + // 6: 44fe add lr, pc + // 8: f85e ff08 ldr pc, [lr, #8]! + // e: .word .got.plt - .plt - 16 + // + // At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from + // `pc` in the add instruction and 8 bytes for the `lr` adjustment. + // + uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16; + assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset"); + write16(buf + 0, 0xb500); + // Split into two halves to support endianness correctly. + write16(buf + 2, 0xf8df); + write16(buf + 4, 0xe008); + write16(buf + 6, 0x44fe); + // Split into two halves to support endianness correctly. + write16(buf + 8, 0xf85e); + write16(buf + 10, 0xff08); + write32(buf + 12, offset); + + memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary + memcpy(buf + 20, trapInstr.data(), 4); + memcpy(buf + 24, trapInstr.data(), 4); + memcpy(buf + 28, trapInstr.data(), 4); + } else { + // Use a similar sequence to that in writePlt(), the difference is the + // calling conventions mean we use lr instead of ip. 
The PLT entry is + // responsible for saving lr on the stack, the dynamic loader is responsible + // for reloading it. + const uint32_t pltData[] = { + 0xe52de004, // L1: str lr, [sp,#-4]! + 0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4) + 0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4) + 0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4) + }; + + uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4; + if (!llvm::isUInt<27>(offset)) { + // We cannot encode the Offset, use the long form. + writePltHeaderLong(buf); + return; + } + write32(buf + 0, pltData[0]); + write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff)); + write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff)); + write32(buf + 12, pltData[3] | (offset & 0xfff)); + memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary + memcpy(buf + 20, trapInstr.data(), 4); + memcpy(buf + 24, trapInstr.data(), 4); + memcpy(buf + 28, trapInstr.data(), 4); } - write32(buf + 0, pltData[0]); - write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff)); - write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff)); - write32(buf + 12, pltData[3] | (offset & 0xfff)); - memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary - memcpy(buf + 20, trapInstr.data(), 4); - memcpy(buf + 24, trapInstr.data(), 4); - memcpy(buf + 28, trapInstr.data(), 4); } void ARM::addPltHeaderSymbols(InputSection &isec) const { - addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec); - addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec); + if (config->armThumbPLTs) { + addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec); + } else { + addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec); + } } // Long form PLT entries that do not have any restrictions on the displacement @@ -279,32 +314,65 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr, // .plt in the positive direction. 
void ARM::writePlt(uint8_t *buf, const Symbol &sym, uint64_t pltEntryAddr) const { - // The PLT entry is similar to the example given in Appendix A of ELF for - // the Arm Architecture. Instead of using the Group Relocations to find the - // optimal rotation for the 8-bit immediate used in the add instructions we - // hard code the most compact rotations for simplicity. This saves a load - // instruction over the long plt sequences. - const uint32_t pltData[] = { - 0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8 - 0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8 - 0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8 - }; - uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8; - if (!llvm::isUInt<27>(offset)) { - // We cannot encode the Offset, use the long form. - writePltLong(buf, sym.getGotPltVA(), pltEntryAddr); - return; + if (!config->armThumbPLTs) { + uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8; + + // The PLT entry is similar to the example given in Appendix A of ELF for + // the Arm Architecture. Instead of using the Group Relocations to find the + // optimal rotation for the 8-bit immediate used in the add instructions we + // hard code the most compact rotations for simplicity. This saves a load + // instruction over the long plt sequences. + const uint32_t pltData[] = { + 0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8 + 0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8 + 0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8 + }; + if (!llvm::isUInt<27>(offset)) { + // We cannot encode the Offset, use the long form. 
+ writePltLong(buf, sym.getGotPltVA(), pltEntryAddr); + return; + } + write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff)); + write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff)); + write32(buf + 8, pltData[2] | (offset & 0xfff)); + memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary + } else { + uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12; + assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset"); + + // A PLT entry will be: + // + // movw ip, # + // movt ip, # + // add ip, pc + // L1: ldr.w pc, [ip] + // b L1 + // + // where ip = r12 = 0xc + + // movw ip, # + write16(buf + 2, 0x0c00); // use `ip` + relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, offset); + + // movt ip, # + write16(buf + 6, 0x0c00); // use `ip` + relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, offset); + + write16(buf + 8, 0x44fc); // add ip, pc + write16(buf + 10, 0xf8dc); // ldr.w pc, [ip] (bottom half) + write16(buf + 12, 0xf000); // ldr.w pc, [ip] (upper half) + write16(buf + 14, 0xe7fc); // Branch to previous instruction } - write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff)); - write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff)); - write32(buf + 8, pltData[2] | (offset & 0xfff)); - memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary } void ARM::addPltSymbols(InputSection &isec, uint64_t off) const { - addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec); - addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec); + if (config->armThumbPLTs) { + addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec); + } else { + addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec); + } } bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, @@ -325,6 +393,8 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, case R_ARM_JUMP24: // Source is ARM, all PLT entries are ARM so no interworking required. 
// Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb). + assert(!config->armThumbPLTs && + "If the source is ARM, we should not need Thumb PLTs"); if (s.isFunc() && expr == R_PC && (s.getVA() & 1)) return true; [[fallthrough]]; @@ -335,9 +405,9 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, } case R_ARM_THM_JUMP19: case R_ARM_THM_JUMP24: - // Source is Thumb, all PLT entries are ARM so interworking is required. + // Source is Thumb, when all PLT entries are ARM interworking is required. // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM). - if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0)) + if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0)) return true; [[fallthrough]]; case R_ARM_THM_CALL: { @@ -547,7 +617,6 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // STT_FUNC we choose whether to write a BL or BLX depending on the // value of bit 0 of Val. With bit 0 == 1 denoting Thumb. If the symbol is // not of type STT_FUNC then we must preserve the original instruction. - // PLT entries are always ARM state so we know we don't need to interwork. assert(rel.sym); // R_ARM_CALL is always reached via relocate(). bool bit0Thumb = val & 1; bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000; @@ -606,12 +675,13 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // PLT entries are always ARM state so we know we need to interwork. assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate(). bool bit0Thumb = val & 1; + bool useThumb = bit0Thumb || config->armThumbPLTs; bool isBlx = (read16(loc + 2) & 0x1000) == 0; // lld 10.0 and before always used bit0Thumb when deciding to write a BLX - // even when type not STT_FUNC. PLT entries generated by LLD are always ARM. - if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb) + // even when type not STT_FUNC. 
+ if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb) stateChangeWarning(loc, rel.type, *rel.sym); - if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) { + if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) { // We are writing a BLX. Ensure BLX destination is 4-byte aligned. As // the BLX instruction may only be two byte aligned. This must be done // before overflow check. diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index f0dfe7f377de0e..883c4a2f84294c 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -217,6 +217,7 @@ struct Config { bool allowMultipleDefinition; bool fatLTOObjects; bool androidPackDynRelocs = false; + bool armThumbPLTs = false; bool armHasBlx = false; bool armHasMovtMovw = false; bool armJ1J2BranchEncoding = false; diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 1f496026d3ae20..d760dddcf5ec5c 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -194,6 +194,18 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) { if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base && profile == ARMBuildAttrs::MicroControllerProfile) config->armCMSESupport = true; + + // The thumb PLT entries require Thumb2 which can be used on multiple archs. + // For now, let's limit it to ones where ARM isn't available and we know have + // Thumb2. 
+ std::optional armISA = + attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use); + std::optional thumb = + attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use); + bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed; + bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32; + if (noArmISA && hasThumb2) + config->armThumbPLTs = true; } InputFile::InputFile(Kind k, MemoryBufferRef m) diff --git a/lld/test/ELF/armv8-thumb-plt-reloc.s b/lld/test/ELF/armv8-thumb-plt-reloc.s new file mode 100644 index 00000000000000..47cd5c1b741ee0 --- /dev/null +++ b/lld/test/ELF/armv8-thumb-plt-reloc.s @@ -0,0 +1,126 @@ +// REQUIRES: arm +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1 +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %s -o %t2 +// RUN: ld.lld %t1 %t2 -o %t +// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s +// RUN: ld.lld -shared %t1 %t2 -o %t.so +// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s +// RUN: llvm-readelf -S -r %t.so | FileCheck -check-prefix=DSOREL %s + +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.be +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %s -o %t2.be +// RUN: ld.lld %t1.be %t2.be -o %t.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s +// RUN: ld.lld -shared %t1.be %t2.be -o %t.so.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s +// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s + +// RUN: ld.lld --be8 %t1.be %t2.be -o %t.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s +// RUN: ld.lld --be8 -shared %t1.be %t2.be -o %t.so.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s +// RUN: llvm-readelf -S -r %t.so.be 
| FileCheck -check-prefix=DSOREL %s + +/// Test PLT entry generation + .text + .align 2 + .globl _start + .type _start,%function +_start: + bl func1 + bl func2 + bl func3 + b.w func1 + b.w func2 + b.w func3 + beq.w func1 + beq.w func2 + beq.w func3 + +/// Executable, expect no PLT +// CHECK: Disassembly of section .text: +// CHECK-EMPTY: +// CHECK-NEXT: : +// CHECK-NEXT: bx lr +// CHECK: : +// CHECK-NEXT: bx lr +// CHECK: : +// CHECK-NEXT: bx lr +// CHECK-NEXT: d4d4 +// CHECK: <_start>: +// CHECK-NEXT: bl {{.*}} +// CHECK-NEXT: bl {{.*}} +// CHECK-NEXT: bl {{.*}} +// CHECK-NEXT: b.w {{.*}} +// CHECK-NEXT: b.w {{.*}} +// CHECK-NEXT: b.w {{.*}} +// CHECK-NEXT: beq.w {{.*}} +// CHECK-NEXT: beq.w {{.*}} +// CHECK-NEXT: beq.w {{.*}} + +// DSO: Disassembly of section .text: +// DSO-EMPTY: +// DSO-NEXT: : +// DSO-NEXT: bx lr +// DSO: : +// DSO-NEXT: bx lr +// DSO: : +// DSO-NEXT: bx lr +// DSO-NEXT: d4d4 +// DSO: <_start>: +/// 0x10260 = PLT func1 +// DSO-NEXT: bl 0x10260 +/// 0x10270 = PLT func2 +// DSO-NEXT: bl 0x10270 +/// 0x10280 = PLT func3 +// DSO-NEXT: bl 0x10280 +/// 0x10260 = PLT func1 +// DSO-NEXT: b.w 0x10260 +/// 0x10270 = PLT func2 +// DSO-NEXT: b.w 0x10270 +/// 0x10280 = PLT func3 +// DSO-NEXT: b.w 0x10280 +/// 0x10260 = PLT func1 +// DSO-NEXT: beq.w 0x10260 +/// 0x10270 = PLT func2 +// DSO-NEXT: beq.w 0x10270 +/// 0x10280 = PLT func3 +// DSO-NEXT: beq.w 0x10280 +// DSO: Disassembly of section .plt: +// DSO-EMPTY: +// DSO-NEXT: 10240 <.plt>: +// DSO-NEXT: push {lr} +// DSO-NEXT: ldr.w lr, [pc, #8] +// DSO-NEXT: add lr, pc +// DSO-NEXT: ldr pc, [lr, #8]! 
+/// 0x20098 = .got.plt (0x302D8) - pc (0x10238 = .plt + 8) - 8 +// DSO-NEXT: .word 0x00020098 +// DSO-NEXT: .word 0xd4d4d4d4 +// DSO-NEXT: .word 0xd4d4d4d4 +// DSO-NEXT: .word 0xd4d4d4d4 +// DSO-NEXT: .word 0xd4d4d4d4 + +/// 136 + 2 << 16 + 0x1026c = 0x302f4 = got entry 1 +// DSO-NEXT: 10260: f240 0c88 movw r12, #136 +// DSO-NEXT: f2c0 0c02 movt r12, #2 +// DSO-NEXT: 44fc add r12, pc +// DSO-NEXT: f8dc f000 ldr.w pc, [r12] +// DSO-NEXT: e7fc b 0x1026a +/// 124 + 2 << 16 + 0x1027c = 0x302f8 = got entry 2 +// DSO-NEXT: 10270: f240 0c7c movw r12, #124 +// DSO-NEXT: f2c0 0c02 movt r12, #2 +// DSO-NEXT: 44fc add r12, pc +// DSO-NEXT: f8dc f000 ldr.w pc, [r12] +// DSO-NEXT: e7fc b 0x1027a +/// 112 + 2 << 16 + 0x1028c = 0x302fc = got entry 3 +// DSO-NEXT: 10280: f240 0c70 movw r12, #112 +// DSO-NEXT: f2c0 0c02 movt r12, #2 +// DSO-NEXT: 44fc add r12, pc +// DSO-NEXT: f8dc f000 ldr.w pc, [r12] +// DSO-NEXT: e7fc b 0x1028a + +// DSOREL: .got.plt PROGBITS 000302e8 {{.*}} 000018 00 WA 0 0 4 +// DSOREL: Relocation section '.rel.plt' +// DSOREL: 000302f4 {{.*}} R_ARM_JUMP_SLOT {{.*}} func1 +// DSOREL: 000302f8 {{.*}} R_ARM_JUMP_SLOT {{.*}} func2 +// DSOREL: 000302fc {{.*}} R_ARM_JUMP_SLOT {{.*}} func3 From f7c8a0339c64810a3c1b28d9b3b20e02a2be6232 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 15:54:44 -0700 Subject: [PATCH 79/89] [RISCV] Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) -> (bitcast (sra (v2Xi16 (bitcast X)), 15)) (#93565) Similar for i16 and i64 elements for both fixed and scalable vectors. This reduces the number of vector instructions, but increases vl/vtype toggles. This reduces some code in 525.x264_r from SPEC2017. In that usage, the vectors are fixed with a small number of elements so vsetivli can be used. This is similar to `performMulVectorCmpZeroCombine` from AArch64. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 41 +++++++ llvm/test/CodeGen/RISCV/rvv/mul-combine.ll | 117 ++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/mul-combine.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5fc613c1b2a140..e99c6208594e3b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13704,6 +13704,44 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) -> +// (bitcast (sra (v2Xi16 (bitcast X)), 15)) +// Same for other equivalent types with other equivalent constants. +static SDValue combineVectorMulToSraBitcast(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Do this for legal vectors unless they are i1 or i8 vectors. + if (!VT.isVector() || !TLI.isTypeLegal(VT) || VT.getScalarSizeInBits() < 16) + return SDValue(); + + if (N->getOperand(0).getOpcode() != ISD::AND || + N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL) + return SDValue(); + + SDValue And = N->getOperand(0); + SDValue Srl = And.getOperand(0); + + APInt V1, V2, V3; + if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) || + !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) || + !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3)) + return SDValue(); + + unsigned HalfSize = VT.getScalarSizeInBits() / 2; + if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) || + V3 != (HalfSize - 1)) + return SDValue(); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), + EVT::getIntegerVT(*DAG.getContext(), HalfSize), + VT.getVectorElementCount() * 2); + SDLoc DL(N); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, HalfVT, Srl.getOperand(0)); + SDValue Sra = DAG.getNode(ISD::SRA, DL, HalfVT, Cast, + 
DAG.getConstant(HalfSize - 1, DL, HalfVT)); + return DAG.getNode(ISD::BITCAST, DL, VT, Sra); +} static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -13748,6 +13786,9 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBinOpOfZExt(N, DAG)) return V; + if (SDValue V = combineVectorMulToSraBitcast(N, DAG)) + return V; + return SDValue(); } diff --git a/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll new file mode 100644 index 00000000000000..6a7da925b4d43d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64 + +define <2 x i16> @test_v2i16(<2 x i16> %x) { +; CHECK-RV32-LABEL: test_v2i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 7 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_v2i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 7 +; CHECK-RV64-NEXT: ret + %1 = lshr <2 x i16> %x, + %2 = and <2 x i16> %1, + %3 = mul <2 x i16> %2, + ret <2 x i16> %3 +} + +define @test_nxv2i16( %x) { +; CHECK-RV32-LABEL: test_nxv2i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-RV32-NEXT: vsrl.vi v8, v8, 7 +; CHECK-RV32-NEXT: li a0, 257 +; CHECK-RV32-NEXT: vand.vx v8, v8, a0 +; CHECK-RV32-NEXT: vsll.vi v8, v8, 8 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_nxv2i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-RV64-NEXT: vsrl.vi v8, v8, 7 +; CHECK-RV64-NEXT: li a0, 257 +; CHECK-RV64-NEXT: vand.vx v8, v8, a0 
+; CHECK-RV64-NEXT: vsll.vi v8, v8, 8 +; CHECK-RV64-NEXT: ret + %1 = lshr %x, splat (i16 7) + %2 = and %1, splat (i16 257) + %3 = mul %2, splat (i16 256) + ret %3 +} + +define <2 x i32> @test_v2i32(<2 x i32> %x) { +; CHECK-RV32-LABEL: test_v2i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_v2i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV64-NEXT: ret + %1 = lshr <2 x i32> %x, + %2 = and <2 x i32> %1, + %3 = mul <2 x i32> %2, + ret <2 x i32> %3 +} + +define @test_nxv2i32( %x) { +; CHECK-RV32-LABEL: test_nxv2i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_nxv2i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV64-NEXT: ret + %1 = lshr %x, splat (i32 15) + %2 = and %1, splat (i32 65537) + %3 = mul %2, splat (i32 65535) + ret %3 +} + +define <2 x i64> @test_v2i64(<2 x i64> %x) { +; CHECK-RV32-LABEL: test_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 31 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 31 +; CHECK-RV64-NEXT: ret + %1 = lshr <2 x i64> %x, + %2 = and <2 x i64> %1, + %3 = mul <2 x i64> %2, + ret <2 x i64> %3 +} + +define @test_nxv2i64( %x) { +; CHECK-RV32-LABEL: test_nxv2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 31 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_nxv2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-RV64-NEXT: 
vsra.vi v8, v8, 31 +; CHECK-RV64-NEXT: ret + %1 = lshr %x, splat (i64 31) + %2 = and %1, splat (i64 4294967297) + %3 = mul %2, splat (i64 4294967295) + ret %3 +} From 0694552cb7e8b2041fd5e765cf5b83fc40664087 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 28 May 2024 15:56:17 -0700 Subject: [PATCH 80/89] [libc] clean up MutexLock (#93619) --- libc/src/__support/threads/linux/CMakeLists.txt | 1 + libc/src/__support/threads/linux/CndVar.cpp | 7 ++++--- libc/src/__support/threads/mutex.h | 14 -------------- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt index 39c4ad20201ca6..f6913ef0834289 100644 --- a/libc/src/__support/threads/linux/CMakeLists.txt +++ b/libc/src/__support/threads/linux/CMakeLists.txt @@ -75,4 +75,5 @@ add_object_library( libc.src.__support.OSUtil.osutil libc.src.__support.threads.linux.futex_word_type libc.src.__support.threads.mutex + libc.src.__support.CPP.mutex ) diff --git a/libc/src/__support/threads/linux/CndVar.cpp b/libc/src/__support/threads/linux/CndVar.cpp index daf56bca1ed21b..b3a0fdbda4e9ea 100644 --- a/libc/src/__support/threads/linux/CndVar.cpp +++ b/libc/src/__support/threads/linux/CndVar.cpp @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/__support/threads/CndVar.h" +#include "src/__support/CPP/mutex.h" #include "src/__support/OSUtil/syscall.h" // syscall_impl #include "src/__support/threads/linux/futex_word.h" // FutexWordType -#include "src/__support/threads/mutex.h" // Mutex, MutexLock +#include "src/__support/threads/mutex.h" // Mutex #include // For syscall numbers. 
@@ -27,7 +28,7 @@ int CndVar::wait(Mutex *m) { CndWaiter waiter; { - MutexLock ml(&qmtx); + cpp::lock_guard ml(qmtx); CndWaiter *old_back = nullptr; if (waitq_front == nullptr) { waitq_front = waitq_back = &waiter; @@ -83,7 +84,7 @@ void CndVar::notify_one() { } void CndVar::broadcast() { - MutexLock ml(&qmtx); + cpp::lock_guard ml(qmtx); uint32_t dummy_futex_word; CndWaiter *waiter = waitq_front; waitq_front = waitq_back = nullptr; diff --git a/libc/src/__support/threads/mutex.h b/libc/src/__support/threads/mutex.h index 9dded2e3f952a1..392b38984dc0ae 100644 --- a/libc/src/__support/threads/mutex.h +++ b/libc/src/__support/threads/mutex.h @@ -43,18 +43,4 @@ #include "src/__support/threads/gpu/mutex.h" #endif // __linux__ -namespace LIBC_NAMESPACE { - -// An RAII class for easy locking and unlocking of mutexes. -class MutexLock { - Mutex *mutex; - -public: - explicit MutexLock(Mutex *m) : mutex(m) { mutex->lock(); } - - ~MutexLock() { mutex->unlock(); } -}; - -} // namespace LIBC_NAMESPACE - #endif // LLVM_LIBC_SRC___SUPPORT_THREADS_MUTEX_H From c179d50fd3d84311708701d84e3bca60570d3d7f Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 16:10:11 -0700 Subject: [PATCH 81/89] [WebAssembly] Add exnref type (#93586) This adds (back) the exnref type restored in the new EH proposal adopted in Oct 2023 CG meeting: https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md --- lld/wasm/WriterUtils.cpp | 2 ++ llvm/include/llvm/BinaryFormat/Wasm.h | 9 ++++--- llvm/include/llvm/CodeGen/ValueTypes.td | 9 ++++--- llvm/include/llvm/IR/Intrinsics.td | 2 ++ llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 18 +++++++++++++ llvm/lib/CodeGen/ValueTypes.cpp | 1 + llvm/lib/Object/WasmObjectFile.cpp | 8 ++++-- llvm/lib/ObjectYAML/WasmYAML.cpp | 2 ++ .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 12 +++++++++ .../WebAssemblyMCTypeUtilities.cpp | 6 +++++ .../MCTargetDesc/WebAssemblyMCTypeUtilities.h | 4 ++- 
.../Utils/WebAssemblyTypeUtilities.cpp | 3 +++ .../WebAssembly/WebAssemblyAsmPrinter.cpp | 2 ++ .../WebAssembly/WebAssemblyExplicitLocals.cpp | 10 +++++++ .../WebAssembly/WebAssemblyFastISel.cpp | 16 ++++++++++++ .../WebAssembly/WebAssemblyISelLowering.cpp | 3 +++ .../WebAssembly/WebAssemblyInstrInfo.td | 3 +++ .../Target/WebAssembly/WebAssemblyInstrRef.td | 8 +++--- .../WebAssembly/WebAssemblyInstrTable.td | 2 ++ .../WebAssembly/WebAssemblyRegStackify.cpp | 2 ++ .../WebAssembly/WebAssemblyRegisterInfo.td | 2 ++ .../WebAssembly/WebAssemblyUtilities.cpp | 2 ++ .../test/CodeGen/WebAssembly/reg-argument.mir | 11 ++++++++ llvm/test/CodeGen/WebAssembly/reg-copy.mir | 11 ++++++++ llvm/test/MC/WebAssembly/basic-assembly.s | 21 +++++++++------ llvm/test/MC/WebAssembly/reference-types.s | 26 +++++++++++++++++-- .../test/MC/WebAssembly/type-checker-errors.s | 16 ++++++++++++ 27 files changed, 188 insertions(+), 23 deletions(-) diff --git a/lld/wasm/WriterUtils.cpp b/lld/wasm/WriterUtils.cpp index cdd2c42f939efe..c6a1592012e64c 100644 --- a/lld/wasm/WriterUtils.cpp +++ b/lld/wasm/WriterUtils.cpp @@ -35,6 +35,8 @@ std::string toString(ValType type) { return "funcref"; case ValType::EXTERNREF: return "externref"; + case ValType::EXNREF: + return "exnref"; case ValType::OTHERREF: return "otherref"; } diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h index 38ef8e37df91d3..acf89885af6fdb 100644 --- a/llvm/include/llvm/BinaryFormat/Wasm.h +++ b/llvm/include/llvm/BinaryFormat/Wasm.h @@ -58,15 +58,16 @@ enum : unsigned { WASM_TYPE_V128 = 0x7B, WASM_TYPE_NULLFUNCREF = 0x73, WASM_TYPE_NULLEXTERNREF = 0x72, + WASM_TYPE_NULLEXNREF = 0x74, WASM_TYPE_NULLREF = 0x71, WASM_TYPE_FUNCREF = 0x70, WASM_TYPE_EXTERNREF = 0x6F, + WASM_TYPE_EXNREF = 0x69, WASM_TYPE_ANYREF = 0x6E, WASM_TYPE_EQREF = 0x6D, WASM_TYPE_I31REF = 0x6C, WASM_TYPE_STRUCTREF = 0x6B, WASM_TYPE_ARRAYREF = 0x6A, - WASM_TYPE_EXNREF = 0x69, WASM_TYPE_NONNULLABLE = 0x64, WASM_TYPE_NULLABLE = 
0x63, WASM_TYPE_FUNC = 0x60, @@ -261,8 +262,9 @@ enum class ValType { V128 = WASM_TYPE_V128, FUNCREF = WASM_TYPE_FUNCREF, EXTERNREF = WASM_TYPE_EXTERNREF, + EXNREF = WASM_TYPE_EXNREF, // Unmodeled value types include ref types with heap types other than - // func or extern, and type-specialized funcrefs + // func, extern or exn, and type-specialized funcrefs OTHERREF = 0xff, }; @@ -410,7 +412,8 @@ struct WasmDataSegment { // 1) Does not model passive or declarative segments (Segment will end up with // an Offset field of i32.const 0) // 2) Does not model init exprs (Segment will get an empty Functions list) -// 2) Does not model types other than basic funcref/externref (see ValType) +// 3) Does not model types other than basic funcref/externref/exnref (see +// ValType) struct WasmElemSegment { uint32_t Flags; uint32_t TableNumber; diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index c3e378ed8f6edb..e322cc04c1c769 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -280,11 +280,12 @@ def untyped : ValueType<8, 193> { // Produces an untyped value } def funcref : ValueType<0, 194>; // WebAssembly's funcref type def externref : ValueType<0, 195>; // WebAssembly's externref type -def x86amx : ValueType<8192, 196>; // X86 AMX value -def i64x8 : ValueType<512, 197>; // 8 Consecutive GPRs (AArch64) +def exnref : ValueType<0, 196>; // WebAssembly's exnref type +def x86amx : ValueType<8192, 197>; // X86 AMX value +def i64x8 : ValueType<512, 198>; // 8 Consecutive GPRs (AArch64) def aarch64svcount - : ValueType<16, 198>; // AArch64 predicate-as-counter -def spirvbuiltin : ValueType<0, 199>; // SPIR-V's builtin type + : ValueType<16, 199>; // AArch64 predicate-as-counter +def spirvbuiltin : ValueType<0, 200>; // SPIR-V's builtin type def token : ValueType<0, 248>; // TokenTy def MetadataVT : ValueType<0, 249> { // Metadata diff --git a/llvm/include/llvm/IR/Intrinsics.td 
b/llvm/include/llvm/IR/Intrinsics.td index 3019f68083d422..c3ac53837444ef 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -316,6 +316,7 @@ def IIT_PPCF128 : IIT_VT; def IIT_V3 : IIT_Vec<3, 53>; def IIT_EXTERNREF : IIT_VT; def IIT_FUNCREF : IIT_VT; +def IIT_EXNREF: IIT_VT; def IIT_I2 : IIT_Int<2, 57>; def IIT_I4 : IIT_Int<4, 58>; def IIT_AARCH64_SVCOUNT : IIT_VT; @@ -581,6 +582,7 @@ def llvm_vararg_ty : LLVMType; // this means vararg here def llvm_externref_ty : LLVMType; def llvm_funcref_ty : LLVMType; +def llvm_exnref_ty : LLVMType; //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index 237f268784bb02..47aab196a6d4f9 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -31,12 +31,17 @@ def int_wasm_ref_null_extern : DefaultAttrsIntrinsic<[llvm_externref_ty], [], [IntrNoMem]>; def int_wasm_ref_null_func : DefaultAttrsIntrinsic<[llvm_funcref_ty], [], [IntrNoMem]>; +def int_wasm_ref_null_exn: + DefaultAttrsIntrinsic<[llvm_exnref_ty], [], [IntrNoMem]>; def int_wasm_ref_is_null_extern : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_externref_ty], [IntrNoMem], "llvm.wasm.ref.is_null.extern">; def int_wasm_ref_is_null_func : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_funcref_ty], [IntrNoMem], "llvm.wasm.ref.is_null.func">; +def int_wasm_ref_is_null_exn : + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_exnref_ty], [IntrNoMem], + "llvm.wasm.ref.is_null.exn">; //===----------------------------------------------------------------------===// // Table intrinsics @@ -47,6 +52,9 @@ def int_wasm_table_set_externref : def int_wasm_table_set_funcref : DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty], [IntrWriteMem]>; +def int_wasm_table_set_exnref : + DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty], + 
[IntrWriteMem]>; def int_wasm_table_get_externref : DefaultAttrsIntrinsic<[llvm_externref_ty], [llvm_table_ty, llvm_i32_ty], @@ -54,6 +62,9 @@ def int_wasm_table_get_externref : def int_wasm_table_get_funcref : DefaultAttrsIntrinsic<[llvm_funcref_ty], [llvm_table_ty, llvm_i32_ty], [IntrReadMem]>; +def int_wasm_table_get_exnref : + DefaultAttrsIntrinsic<[llvm_exnref_ty], [llvm_table_ty, llvm_i32_ty], + [IntrReadMem]>; // Query the current table size, and increase the current table size. def int_wasm_table_size : @@ -68,6 +79,9 @@ def int_wasm_table_grow_externref : def int_wasm_table_grow_funcref : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_table_ty, llvm_funcref_ty, llvm_i32_ty], []>; +def int_wasm_table_grow_exnref : + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_table_ty, llvm_exnref_ty, llvm_i32_ty], []>; def int_wasm_table_fill_externref : DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_externref_ty, @@ -76,6 +90,10 @@ def int_wasm_table_fill_funcref : DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty, llvm_i32_ty], []>; +def int_wasm_table_fill_exnref : + DefaultAttrsIntrinsic<[], + [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty, + llvm_i32_ty], []>; //===----------------------------------------------------------------------===// // Trapping float-to-int conversions diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 3d5c58d282da56..df1c02c3dc67c2 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -181,6 +181,7 @@ std::string EVT::getEVTString() const { case MVT::Metadata: return "Metadata"; case MVT::Untyped: return "Untyped"; case MVT::funcref: return "funcref"; + case MVT::exnref: return "exnref"; case MVT::externref: return "externref"; case MVT::aarch64svcount: return "aarch64svcount"; diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 6507a0e5950ebe..23381955c60a88 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ 
b/llvm/lib/Object/WasmObjectFile.cpp @@ -177,8 +177,8 @@ static uint8_t readOpcode(WasmObjectFile::ReadContext &Ctx) { static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx, uint32_t Code) { - // only directly encoded FUNCREF/EXTERNREF are supported - // (not ref null func or ref null extern) + // only directly encoded FUNCREF/EXTERNREF/EXNREF are supported + // (not ref null func, ref null extern, or ref null exn) switch (Code) { case wasm::WASM_TYPE_I32: case wasm::WASM_TYPE_I64: @@ -187,6 +187,7 @@ static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx, case wasm::WASM_TYPE_V128: case wasm::WASM_TYPE_FUNCREF: case wasm::WASM_TYPE_EXTERNREF: + case wasm::WASM_TYPE_EXNREF: return wasm::ValType(Code); } if (Code == wasm::WASM_TYPE_NULLABLE || Code == wasm::WASM_TYPE_NONNULLABLE) { @@ -1288,6 +1289,7 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) { auto ElemType = Im.Table.ElemType; if (ElemType != wasm::ValType::FUNCREF && ElemType != wasm::ValType::EXTERNREF && + ElemType != wasm::ValType::EXNREF && ElemType != wasm::ValType::OTHERREF) return make_error("invalid table element type", object_error::parse_failed); @@ -1346,6 +1348,7 @@ Error WasmObjectFile::parseTableSection(ReadContext &Ctx) { auto ElemType = Tables.back().Type.ElemType; if (ElemType != wasm::ValType::FUNCREF && ElemType != wasm::ValType::EXTERNREF && + ElemType != wasm::ValType::EXNREF && ElemType != wasm::ValType::OTHERREF) { return make_error("invalid table element type", object_error::parse_failed); @@ -1680,6 +1683,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { Segment.ElemKind = parseValType(Ctx, ElemKind); if (Segment.ElemKind != wasm::ValType::FUNCREF && Segment.ElemKind != wasm::ValType::EXTERNREF && + Segment.ElemKind != wasm::ValType::EXNREF && Segment.ElemKind != wasm::ValType::OTHERREF) { return make_error("invalid elem type", object_error::parse_failed); diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp 
b/llvm/lib/ObjectYAML/WasmYAML.cpp index 544a91d03dce01..7ad338f65706d5 100644 --- a/llvm/lib/ObjectYAML/WasmYAML.cpp +++ b/llvm/lib/ObjectYAML/WasmYAML.cpp @@ -606,6 +606,7 @@ void ScalarEnumerationTraits::enumeration( ECase(V128); ECase(FUNCREF); ECase(EXTERNREF); + ECase(EXNREF); ECase(OTHERREF); #undef ECase } @@ -640,6 +641,7 @@ void ScalarEnumerationTraits::enumeration( #define ECase(X) IO.enumCase(Type, #X, CONCAT(X)); ECase(FUNCREF); ECase(EXTERNREF); + ECase(EXNREF); ECase(OTHERREF); #undef ECase } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 34502170a5c71f..b7498cb4299452 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -355,6 +355,8 @@ inline bool isArgument(unsigned Opc) { case WebAssembly::ARGUMENT_funcref_S: case WebAssembly::ARGUMENT_externref: case WebAssembly::ARGUMENT_externref_S: + case WebAssembly::ARGUMENT_exnref: + case WebAssembly::ARGUMENT_exnref_S: return true; default: return false; @@ -377,6 +379,8 @@ inline bool isCopy(unsigned Opc) { case WebAssembly::COPY_FUNCREF_S: case WebAssembly::COPY_EXTERNREF: case WebAssembly::COPY_EXTERNREF_S: + case WebAssembly::COPY_EXNREF: + case WebAssembly::COPY_EXNREF_S: return true; default: return false; @@ -399,6 +403,8 @@ inline bool isTee(unsigned Opc) { case WebAssembly::TEE_FUNCREF_S: case WebAssembly::TEE_EXTERNREF: case WebAssembly::TEE_EXTERNREF_S: + case WebAssembly::TEE_EXNREF: + case WebAssembly::TEE_EXNREF_S: return true; default: return false; @@ -489,6 +495,8 @@ inline bool isLocalGet(unsigned Opc) { case WebAssembly::LOCAL_GET_FUNCREF_S: case WebAssembly::LOCAL_GET_EXTERNREF: case WebAssembly::LOCAL_GET_EXTERNREF_S: + case WebAssembly::LOCAL_GET_EXNREF: + case WebAssembly::LOCAL_GET_EXNREF_S: return true; default: return false; @@ -511,6 +519,8 @@ inline bool 
isLocalSet(unsigned Opc) { case WebAssembly::LOCAL_SET_FUNCREF_S: case WebAssembly::LOCAL_SET_EXTERNREF: case WebAssembly::LOCAL_SET_EXTERNREF_S: + case WebAssembly::LOCAL_SET_EXNREF: + case WebAssembly::LOCAL_SET_EXNREF_S: return true; default: return false; @@ -533,6 +543,8 @@ inline bool isLocalTee(unsigned Opc) { case WebAssembly::LOCAL_TEE_FUNCREF_S: case WebAssembly::LOCAL_TEE_EXTERNREF: case WebAssembly::LOCAL_TEE_EXTERNREF_S: + case WebAssembly::LOCAL_TEE_EXNREF: + case WebAssembly::LOCAL_TEE_EXNREF_S: return true; default: return false; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp index 8ea02bd2ad1ff0..d9c8e22bbbaf5b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp @@ -27,6 +27,7 @@ std::optional WebAssembly::parseType(StringRef Type) { wasm::ValType::V128) .Case("funcref", wasm::ValType::FUNCREF) .Case("externref", wasm::ValType::EXTERNREF) + .Case("exnref", wasm::ValType::EXNREF) .Default(std::nullopt); } @@ -40,6 +41,7 @@ WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) { .Case("v128", WebAssembly::BlockType::V128) .Case("funcref", WebAssembly::BlockType::Funcref) .Case("externref", WebAssembly::BlockType::Externref) + .Case("exnref", WebAssembly::BlockType::Exnref) .Case("void", WebAssembly::BlockType::Void) .Default(WebAssembly::BlockType::Invalid); } @@ -62,6 +64,8 @@ const char *WebAssembly::anyTypeToString(unsigned Type) { return "funcref"; case wasm::WASM_TYPE_EXTERNREF: return "externref"; + case wasm::WASM_TYPE_EXNREF: + return "exnref"; case wasm::WASM_TYPE_FUNC: return "func"; case wasm::WASM_TYPE_NORESULT: @@ -110,6 +114,8 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) { return wasm::ValType::FUNCREF; case WebAssembly::EXTERNREFRegClassID: return wasm::ValType::EXTERNREF; + case 
WebAssembly::EXNREFRegClassID: + return wasm::ValType::EXNREF; default: llvm_unreachable("unexpected type"); } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h index 486cf264d13e2f..063ee4dba9068e 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h @@ -32,6 +32,7 @@ enum class BlockType : unsigned { V128 = unsigned(wasm::ValType::V128), Externref = unsigned(wasm::ValType::EXTERNREF), Funcref = unsigned(wasm::ValType::FUNCREF), + Exnref = unsigned(wasm::ValType::EXNREF), // Multivalue blocks (and other non-void blocks) are only emitted when the // blocks will never be exited and are at the ends of functions (see // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made @@ -41,7 +42,8 @@ enum class BlockType : unsigned { }; inline bool isRefType(wasm::ValType Type) { - return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF; + return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF || + Type == wasm::ValType::EXNREF; } // Convert ValType or a list/signature of ValTypes to a string. 
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp index 867953b4e8d71d..f9293460e701a0 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp @@ -33,6 +33,7 @@ MVT WebAssembly::parseMVT(StringRef Type) { .Case("v2i64", MVT::v2i64) .Case("funcref", MVT::funcref) .Case("externref", MVT::externref) + .Case("exnref", MVT::exnref) .Default(MVT::INVALID_SIMPLE_VALUE_TYPE); } @@ -58,6 +59,8 @@ wasm::ValType WebAssembly::toValType(MVT Type) { return wasm::ValType::FUNCREF; case MVT::externref: return wasm::ValType::EXTERNREF; + case MVT::exnref: + return wasm::ValType::EXNREF; default: llvm_unreachable("unexpected type"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 443558537da245..0b7ec6e74cab20 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -125,6 +125,8 @@ static char getInvokeSig(wasm::ValType VT) { return 'F'; case wasm::ValType::EXTERNREF: return 'X'; + case wasm::ValType::EXNREF: + return 'E'; default: llvm_unreachable("Unhandled wasm::ValType enum"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp index 0159c44a79b76d..3c6a29311a10e4 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp @@ -100,6 +100,8 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) { return WebAssembly::DROP_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::DROP_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::DROP_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -119,6 +121,8 @@ static unsigned 
getLocalGetOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_GET_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::LOCAL_GET_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_GET_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -138,6 +142,8 @@ static unsigned getLocalSetOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_SET_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::LOCAL_SET_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_SET_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -157,6 +163,8 @@ static unsigned getLocalTeeOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_TEE_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::LOCAL_TEE_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_TEE_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -176,6 +184,8 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) { return MVT::funcref; if (RC == &WebAssembly::EXTERNREFRegClass) return MVT::externref; + if (RC == &WebAssembly::EXNREFRegClass) + return MVT::exnref; llvm_unreachable("unrecognized register class"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 26e13948bc9a68..aa3aa1b007a530 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -137,6 +137,10 @@ class WebAssemblyFastISel final : public FastISel { if (Subtarget->hasReferenceTypes()) return VT; break; + case MVT::exnref: + if (Subtarget->hasReferenceTypes() && Subtarget->hasExceptionHandling()) + return VT; + break; case MVT::f16: return MVT::f32; case MVT::v16i8: @@ -717,6 +721,10 @@ bool WebAssemblyFastISel::fastLowerArguments() { Opc = WebAssembly::ARGUMENT_externref; RC = &WebAssembly::EXTERNREFRegClass; break; + 
case MVT::exnref: + Opc = WebAssembly::ARGUMENT_exnref; + RC = &WebAssembly::EXNREFRegClass; + break; default: return false; } @@ -821,6 +829,9 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { case MVT::externref: ResultReg = createResultReg(&WebAssembly::EXTERNREFRegClass); break; + case MVT::exnref: + ResultReg = createResultReg(&WebAssembly::EXNREFRegClass); + break; default: return false; } @@ -948,6 +959,10 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { Opc = WebAssembly::SELECT_EXTERNREF; RC = &WebAssembly::EXTERNREFRegClass; break; + case MVT::exnref: + Opc = WebAssembly::SELECT_EXNREF; + RC = &WebAssembly::EXNREFRegClass; + break; default: return false; } @@ -1355,6 +1370,7 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { case MVT::v2f64: case MVT::funcref: case MVT::externref: + case MVT::exnref: break; default: return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 518b6932a0c879..f9f16498bb390c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -76,6 +76,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( if (Subtarget->hasReferenceTypes()) { addRegisterClass(MVT::externref, &WebAssembly::EXTERNREFRegClass); addRegisterClass(MVT::funcref, &WebAssembly::FUNCREFRegClass); + if (Subtarget->hasExceptionHandling()) { + addRegisterClass(MVT::exnref, &WebAssembly::EXNREFRegClass); + } } // Compute derived properties from the register classes. 
computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index c1a5a45395e87d..3d37eb2fa27bce 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -292,6 +292,7 @@ defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; +defm "": ARGUMENT; // local.get and local.set are not generated by instruction selection; they // are implied by virtual register uses and defs. @@ -375,6 +376,8 @@ defm "" : LOCAL; defm "" : LOCAL, Requires<[HasSIMD128]>; defm "" : LOCAL, Requires<[HasReferenceTypes]>; defm "" : LOCAL, Requires<[HasReferenceTypes]>; +defm "" : LOCAL, + Requires<[HasReferenceTypes, HasExceptionHandling]>; let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in { defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm), diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index 608963d588635e..2654a09387fd4a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -17,8 +17,9 @@ multiclass REF_I { [(set rc:$dst, (!cast("int_wasm_ref_null_" # ht)))], "ref.null_" # ht # "$dst", "ref.null_" # ht, - !cond(!eq(ht, "func") : 0xd070, - !eq(ht, "extern") : 0xd06f)>, + !cond(!eq(ht, "func") : 0xd070, + !eq(ht, "extern") : 0xd06f, + !eq(ht, "exn") : 0xd069)>, Requires<[HasReferenceTypes]>; defm SELECT_#rc: I<(outs rc:$dst), (ins rc:$lhs, rc:$rhs, I32:$cond), (outs), (ins), @@ -37,8 +38,9 @@ multiclass REF_I { defm "" : REF_I; defm "" : REF_I; +defm "" : REF_I; -foreach rc = [FUNCREF, EXTERNREF] in { +foreach rc = [FUNCREF, EXTERNREF, EXNREF] in { def : Pat<(select (i32 (setne I32:$cond, 0)), rc:$lhs, rc:$rhs), (!cast("SELECT_"#rc) rc:$lhs, rc:$rhs, I32:$cond)>; def : Pat<(select (i32 (seteq I32:$cond, 0)), rc:$lhs, rc:$rhs), 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td index 069ce5e3bc94a9..02f0ab8577c3d0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td @@ -64,6 +64,8 @@ multiclass TABLE { defm "" : TABLE, Requires<[HasReferenceTypes]>; defm "" : TABLE, Requires<[HasReferenceTypes]>; +defm "" : TABLE, + Requires<[HasReferenceTypes, HasExceptionHandling]>; def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r), (TABLE_SET_FUNCREF mcsym:$table, i32:$idx, funcref:$r)>, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index ef174e1716ef1e..d4edb6bf18d932 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -504,6 +504,8 @@ static unsigned getTeeOpcode(const TargetRegisterClass *RC) { return WebAssembly::TEE_EXTERNREF; if (RC == &WebAssembly::FUNCREFRegClass) return WebAssembly::TEE_FUNCREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::TEE_EXNREF; llvm_unreachable("Unexpected register class"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 4e2faa608be077..17889dacc868c2 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -45,6 +45,7 @@ def V128_0: WebAssemblyReg<"%v128">; def FUNCREF_0 : WebAssemblyReg<"%funcref.0">; def EXTERNREF_0 : WebAssemblyReg<"%externref.0">; +def EXNREF_0 : WebAssemblyReg<"%exnref.0">; // The value stack "register". This is an opaque entity which serves to order // uses and defs that must remain in LIFO order. 
@@ -68,3 +69,4 @@ def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8, 128, (add V128_0)>; def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>; def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>; +def EXNREF : WebAssemblyRegClass<[exnref], 0, (add EXNREF_0)>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 60e872549f87d9..5e7279808cce63 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -175,6 +175,8 @@ unsigned WebAssembly::getCopyOpcodeForRegClass(const TargetRegisterClass *RC) { return WebAssembly::COPY_FUNCREF; case WebAssembly::EXTERNREFRegClassID: return WebAssembly::COPY_EXTERNREF; + case WebAssembly::EXNREFRegClassID: + return WebAssembly::COPY_EXNREF; default: llvm_unreachable("Unexpected register class"); } diff --git a/llvm/test/CodeGen/WebAssembly/reg-argument.mir b/llvm/test/CodeGen/WebAssembly/reg-argument.mir index 23e66dfc71fa1b..a549990bdb0a2b 100644 --- a/llvm/test/CodeGen/WebAssembly/reg-argument.mir +++ b/llvm/test/CodeGen/WebAssembly/reg-argument.mir @@ -68,3 +68,14 @@ body: | %1:externref = ARGUMENT_externref 0, implicit $arguments RETURN implicit-def $arguments ... +--- +name: argument_exnref +# CHECK-LABEL: argument_exnref +body: | + ; CHECK-LABEL: bb.0: + ; CHECK-NEXT: %1:exnref = ARGUMENT_exnref 0 + bb.0: + %0:i32 = CONST_I32 0, implicit-def $arguments + %1:exnref = ARGUMENT_exnref 0, implicit $arguments + RETURN implicit-def $arguments +... diff --git a/llvm/test/CodeGen/WebAssembly/reg-copy.mir b/llvm/test/CodeGen/WebAssembly/reg-copy.mir index 31a5bfa63a4ea2..763fe42d07b61a 100644 --- a/llvm/test/CodeGen/WebAssembly/reg-copy.mir +++ b/llvm/test/CodeGen/WebAssembly/reg-copy.mir @@ -77,3 +77,14 @@ body: | %0:externref = COPY %1:externref RETURN implicit-def $arguments ... 
+--- +name: copy_exnref +# CHECK-LABEL: copy_exnref +body: | + ; CHECK-LABEL: bb.0: + ; CHECK-NEXT: %0:exnref = COPY_EXNREF %1:exnref + ; CHECK-NEXT: RETURN + bb.0: + %0:exnref = COPY %1:exnref + RETURN implicit-def $arguments +... diff --git a/llvm/test/MC/WebAssembly/basic-assembly.s b/llvm/test/MC/WebAssembly/basic-assembly.s index 769cd7edfa8a3e..ac358c1b5c7a52 100644 --- a/llvm/test/MC/WebAssembly/basic-assembly.s +++ b/llvm/test/MC/WebAssembly/basic-assembly.s @@ -146,12 +146,14 @@ test0: .ident "clang version 9.0.0 (trunk 364502) (llvm/trunk 364571)" -.tabletype empty_eref_table, externref -empty_eref_table: +.tabletype empty_externref_table, externref +empty_externref_table: -.tabletype empty_fref_table, funcref -empty_fref_table: +.tabletype empty_funcref_table, funcref +empty_funcref_table: +.tabletype empty_exnref_table, exnref +empty_exnref_table: # CHECK: .text # CHECK: .globaltype __stack_pointer, i32 @@ -283,8 +285,11 @@ empty_fref_table: # CHECK-NEXT: .p2align 2 # CHECK-NEXT: .int32 test0 -# CHECK: .tabletype empty_eref_table, externref -# CHECK-NEXT: empty_eref_table: +# CHECK: .tabletype empty_externref_table, externref +# CHECK-NEXT: empty_externref_table: -# CHECK: .tabletype empty_fref_table, funcref -# CHECK-NEXT: empty_fref_table: +# CHECK: .tabletype empty_funcref_table, funcref +# CHECK-NEXT: empty_funcref_table: + +# CHECK: .tabletype empty_exnref_table, exnref +# CHECK-NEXT: empty_exnref_table: diff --git a/llvm/test/MC/WebAssembly/reference-types.s b/llvm/test/MC/WebAssembly/reference-types.s index ab3e3ee6b155b1..2f8bfba68dcea1 100644 --- a/llvm/test/MC/WebAssembly/reference-types.s +++ b/llvm/test/MC/WebAssembly/reference-types.s @@ -4,22 +4,27 @@ # CHECK-LABEL:ref_is_null: # CHECK: ref.is_null # encoding: [0xd1] ref_is_null: - .functype ref_is_null () -> (i32, i32) + .functype ref_is_null () -> (i32, i32, i32) ref.null_extern ref.is_null ref.null_func ref.is_null + ref.null_exn + ref.is_null end_function # CHECK-LABEL: ref_null_test: 
# CHECK: ref.null_func # encoding: [0xd0,0x70] # CHECK: ref.null_extern # encoding: [0xd0,0x6f] +# CHECK: ref.null_exn # encoding: [0xd0,0x69] ref_null_test: .functype ref_null_test () -> () ref.null_func drop ref.null_extern drop + ref.null_exn + drop end_function # CHECK-LABEL: ref_sig_test_funcref: @@ -36,9 +41,17 @@ ref_sig_test_externref: local.get 0 end_function +# CHECK-LABEL: ref_sig_test_exnref: +# CHECK-NEXT: .functype ref_sig_test_exnref (exnref) -> (exnref) +ref_sig_test_exnref: + .functype ref_sig_test_exnref (exnref) -> (exnref) + local.get 0 + end_function + # CHECK-LABEL: ref_select_test: # CHECK: funcref.select # encoding: [0x1b] # CHECK: externref.select # encoding: [0x1b] +# CHECK: exnref.select # encoding: [0x1b] ref_select_test: .functype ref_select_test () -> () ref.null_func @@ -51,15 +64,24 @@ ref_select_test: i32.const 0 externref.select drop + ref.null_exn + ref.null_exn + i32.const 0 + exnref.select + drop end_function # CHECK-LABEL: ref_block_test: # CHECK: block funcref # CHECK: block externref +# CHECK: block exnref ref_block_test: - .functype ref_block_test () -> (externref, funcref) + .functype ref_block_test () -> (exnref, externref, funcref) block funcref block externref + block exnref + ref.null_exn + end_block ref.null_extern end_block ref.null_func diff --git a/llvm/test/MC/WebAssembly/type-checker-errors.s b/llvm/test/MC/WebAssembly/type-checker-errors.s index 5e28d117501e98..d2841250137a8c 100644 --- a/llvm/test/MC/WebAssembly/type-checker-errors.s +++ b/llvm/test/MC/WebAssembly/type-checker-errors.s @@ -215,6 +215,22 @@ table_fill_type_mismatch_3: table.fill valid_table end_function +table_fill_type_mismatch_4: + .functype table_fill_type_mismatch_4 () -> () + ref.null_exn + i32.const 1 +# CHECK: [[@LINE+1]]:3: error: popped exnref, expected externref + table.fill valid_table + end_function + +table_fill_type_mismatch_5: + .functype table_fill_type_mismatch_5 () -> () + ref.null_exn + i32.const 1 +# CHECK: [[@LINE+1]]:3: 
error: popped exnref, expected externref + table.fill valid_table + end_function + table_grow_non_exist_table: .functype table_grow_non_exist_table (externref, i32) -> (i32) local.get 0 From 4486fcba756bfa4c8729673a9533578232f0bc04 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Tue, 28 May 2024 19:14:26 -0400 Subject: [PATCH 82/89] [libc] Add proxy header for float.h. (#93504) This is the continuation of https://github.com/llvm/llvm-project/pull/88674. Fixes #88433, #90496. --------- Co-authored-by: aniplcc --- libc/hdr/CMakeLists.txt | 10 ++++++ libc/hdr/float_macros.h | 22 ++++++++++++ libc/include/llvm-libc-macros/float-macros.h | 35 ++++++++----------- .../macros/properties/CMakeLists.txt | 2 +- libc/src/__support/macros/properties/types.h | 2 +- libc/src/math/generic/CMakeLists.txt | 4 +++ libc/src/math/generic/scalbn.cpp | 11 +++--- libc/src/math/generic/scalbnf.cpp | 11 +++--- libc/src/math/generic/scalbnf128.cpp | 13 +++---- libc/src/math/generic/scalbnl.cpp | 11 +++--- .../llvm-project-overlay/libc/BUILD.bazel | 7 +++- 11 files changed, 78 insertions(+), 50 deletions(-) create mode 100644 libc/hdr/float_macros.h diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt index 91b8cb71552a71..66b82c84dac499 100644 --- a/libc/hdr/CMakeLists.txt +++ b/libc/hdr/CMakeLists.txt @@ -87,4 +87,14 @@ add_proxy_header_library( libc.include.llvm-libc-macros.time_macros ) +add_proxy_header_library( + float_macros + HDRS + float_macros.h + DEPENDS + libc.include.llvm-libc-macros.float_macros + FULL_BUILD_DEPENDS + libc.include.float +) + add_subdirectory(types) diff --git a/libc/hdr/float_macros.h b/libc/hdr/float_macros.h new file mode 100644 index 00000000000000..a0ef5e29b98687 --- /dev/null +++ b/libc/hdr/float_macros.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from math.h ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_FLOAT_MACROS_H +#define LLVM_LIBC_HDR_FLOAT_MACROS_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-macros/float-macros.h" + +#else // Overlay mode + +#include + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_FLOAT_MACROS_H diff --git a/libc/include/llvm-libc-macros/float-macros.h b/libc/include/llvm-libc-macros/float-macros.h index 4fe8590c5f70c8..81c1df868bf6cd 100644 --- a/libc/include/llvm-libc-macros/float-macros.h +++ b/libc/include/llvm-libc-macros/float-macros.h @@ -9,21 +9,6 @@ #ifndef LLVM_LIBC_MACROS_FLOAT_MACROS_H #define LLVM_LIBC_MACROS_FLOAT_MACROS_H -// Suppress `#include_next is a language extension` warnings. -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wgnu-include-next" -#pragma clang diagnostic ignored "-Winclude-next-absolute-path" -#else // gcc -#pragma GCC system_header -#endif //__clang__ - -#include_next - -#ifdef __clang__ -#pragma clang diagnostic pop -#endif //__clang__ - #ifndef FLT_RADIX #define FLT_RADIX __FLT_RADIX__ #endif // FLT_RADIX @@ -32,9 +17,13 @@ #define FLT_EVAL_METHOD __FLT_EVAL_METHOD__ #endif // FLT_EVAL_METHOD -#ifndef DECIMAL_DIG -#define DECIMAL_DIG __DECIMAL_DIG__ -#endif // DECIMAL_DIG +#ifndef FLT_ROUNDS +#if __has_builtin(__builtin_flt_rounds) +#define FLT_ROUNDS __builtin_flt_rounds() +#else +#define FLT_ROUNDS 1 +#endif +#endif // FLT_ROUNDS #ifndef FLT_DECIMAL_DIG #define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__ @@ -48,6 +37,10 @@ #define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ #endif // LDBL_DECIMAL_DIG +#ifndef DECIMAL_DIG +#define DECIMAL_DIG __DECIMAL_DIG__ +#endif // DECIMAL_DIG + #ifndef FLT_DIG #define FLT_DIG __FLT_DIG__ #endif // FLT_DIG @@ -97,15 +90,15 @@ #endif // LDBL_MAX #ifndef FLT_TRUE_MIN -#define FLT_TRUE_MIN 
__FLT_TRUE_MIN__ +#define FLT_TRUE_MIN __FLT_DENORM_MIN__ #endif // FLT_TRUE_MIN #ifndef DBL_TRUE_MIN -#define DBL_TRUE_MIN __DBL_TRUE_MIN__ +#define DBL_TRUE_MIN __DBL_DENORM_MIN__ #endif // DBL_TRUE_MIN #ifndef LDBL_TRUE_MIN -#define LDBL_TRUE_MIN __LDBL_TRUE_MIN__ +#define LDBL_TRUE_MIN __LDBL_DENORM_MIN__ #endif // LDBL_TRUE_MIN #ifndef FLT_EPSILON diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt index bbc45650f3fca3..7718aeaa3de5af 100644 --- a/libc/src/__support/macros/properties/CMakeLists.txt +++ b/libc/src/__support/macros/properties/CMakeLists.txt @@ -33,6 +33,6 @@ add_header_library( .compiler .cpu_features .os - libc.include.llvm-libc-macros.float_macros + libc.hdr.float_macros libc.include.llvm-libc-types.float128 ) diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h index d43cf99e6859be..781cf1b7a2b627 100644 --- a/libc/src/__support/macros/properties/types.h +++ b/libc/src/__support/macros/properties/types.h @@ -10,7 +10,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H -#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG +#include "hdr/float_macros.h" // LDBL_MANT_DIG #include "include/llvm-libc-types/float128.h" // float128 #include "src/__support/macros/properties/architectures.h" #include "src/__support/macros/properties/compiler.h" diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index daaf505008ca11..269bc6be5d8343 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -2933,6 +2933,7 @@ add_entrypoint_object( HDRS ../scalbn.h DEPENDS + libc.hdr.float_macros libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -2945,6 +2946,7 @@ add_entrypoint_object( HDRS ../scalbnf.h DEPENDS + libc.hdr.float_macros 
libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -2957,6 +2959,7 @@ add_entrypoint_object( HDRS ../scalbnl.h DEPENDS + libc.hdr.float_macros libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -2969,6 +2972,7 @@ add_entrypoint_object( HDRS ../scalbnf128.h DEPENDS + libc.hdr.float_macros libc.src.__support.macros.properties.types libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS diff --git a/libc/src/math/generic/scalbn.cpp b/libc/src/math/generic/scalbn.cpp index 3908f5892f144f..207cce1550bc01 100644 --- a/libc/src/math/generic/scalbn.cpp +++ b/libc/src/math/generic/scalbn.cpp @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbn.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, scalbn, (double x, int n)) { -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. -#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/scalbnf.cpp b/libc/src/math/generic/scalbnf.cpp index 4a4fa86dcfd895..e478088d3ce5a5 100644 --- a/libc/src/math/generic/scalbnf.cpp +++ b/libc/src/math/generic/scalbnf.cpp @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbnf.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, scalbnf, (float x, int n)) { -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. 
-#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/scalbnf128.cpp b/libc/src/math/generic/scalbnf128.cpp index be3d29ed27e985..5fd59611d53de7 100644 --- a/libc/src/math/generic/scalbnf128.cpp +++ b/libc/src/math/generic/scalbnf128.cpp @@ -7,21 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbnf128.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float128, scalbnf128, (float128 x, int n)) { -// TODO: should be switched to use `FLT_RADIX` in hdr/float_macros.h" instead -// see: https://github.com/llvm/llvm-project/issues/90496 -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. -#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/scalbnl.cpp b/libc/src/math/generic/scalbnl.cpp index 681338ec01f078..1225a7ebaf572d 100644 --- a/libc/src/math/generic/scalbnl.cpp +++ b/libc/src/math/generic/scalbnl.cpp @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbnl.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(long double, scalbnl, (long double x, int n)) { -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. 
-#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 446499cf15d7b4..70ec3a48a5e2e3 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -127,6 +127,11 @@ libc_support_library( hdrs = ["hdr/time_macros.h"], ) +libc_support_library( + name = "hdr_float_macros", + hdrs = ["hdr/float_macros.h"], +) + ############################ Type Proxy Header Files ########################### libc_support_library( @@ -189,7 +194,7 @@ libc_support_library( ":__support_macros_properties_compiler", ":__support_macros_properties_cpu_features", ":__support_macros_properties_os", - ":llvm_libc_macros_float_macros", + ":hdr_float_macros", ":llvm_libc_types_float128", ], ) From 39e5036c0e22cea24df73d28746bb8fe0a117f9d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 16:25:54 -0700 Subject: [PATCH 83/89] [SCEV] Add predicated version of getSymbolicMaxBackedgeTakenCount. (#93498) This patch adds a predicated version of getSymbolicMaxBackedgeTakenCount. The intended use for this is loop access analysis for loops with uncountable exits. When analyzing dependences and computing runtime checks, we need the smallest upper bound on the number of iterations. In terms of memory safety, it shouldn't matter if any uncomputable exits leave the loop, as long as we prove that there are no dependences given the minimum of the countable exits. The same should apply also for generating runtime checks. 
PR: https://github.com/llvm/llvm-project/pull/93498 --- llvm/include/llvm/Analysis/ScalarEvolution.h | 19 +++++++- llvm/lib/Analysis/ScalarEvolution.cpp | 48 +++++++++++++++++-- ...cated-symbolic-max-backedge-taken-count.ll | 6 +++ 3 files changed, 67 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 1d016b28347d27..72f3d945424963 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -912,6 +912,13 @@ class ScalarEvolution { return getBackedgeTakenCount(L, SymbolicMaximum); } + /// Similar to getSymbolicMaxBackedgeTakenCount, except it will add a set of + /// SCEV predicates to Predicates that are required to be true in order for + /// the answer to be correct. Predicates can be checked with run-time + /// checks and can be used to perform loop versioning. + const SCEV *getPredicatedSymbolicMaxBackedgeTakenCount( + const Loop *L, SmallVector &Predicates); + /// Return true if the backedge taken count is either the value returned by /// getConstantMaxBackedgeTakenCount or zero. bool isBackedgeTakenCountMaxOrZero(const Loop *L); @@ -1549,7 +1556,9 @@ class ScalarEvolution { ScalarEvolution *SE) const; /// Get the symbolic max backedge taken count for the loop. - const SCEV *getSymbolicMax(const Loop *L, ScalarEvolution *SE); + const SCEV * + getSymbolicMax(const Loop *L, ScalarEvolution *SE, + SmallVector *Predicates = nullptr); /// Get the symbolic max backedge taken count for the particular loop exit. const SCEV *getSymbolicMax(const BasicBlock *ExitingBlock, @@ -1746,7 +1755,7 @@ class ScalarEvolution { /// Similar to getBackedgeTakenInfo, but will add predicates as required /// with the purpose of returning complete information. 
- const BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L); + BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L); /// Compute the number of times the specified loop will iterate. /// If AllowPredicates is set, we will create new SCEV predicates as @@ -2311,6 +2320,9 @@ class PredicatedScalarEvolution { /// Get the (predicated) backedge count for the analyzed loop. const SCEV *getBackedgeTakenCount(); + /// Get the (predicated) symbolic max backedge count for the analyzed loop. + const SCEV *getSymbolicMaxBackedgeTakenCount(); + /// Adds a new predicate. void addPredicate(const SCEVPredicate &Pred); @@ -2379,6 +2391,9 @@ class PredicatedScalarEvolution { /// The backedge taken count. const SCEV *BackedgeCount = nullptr; + + /// The symbolic backedge taken count. + const SCEV *SymbolicMaxBackedgeCount = nullptr; }; template <> struct DenseMapInfo { diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index bb56b41fe15d58..e46d7183a2a359 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8295,6 +8295,11 @@ const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L, llvm_unreachable("Invalid ExitCountKind!"); } +const SCEV *ScalarEvolution::getPredicatedSymbolicMaxBackedgeTakenCount( + const Loop *L, SmallVector &Preds) { + return getPredicatedBackedgeTakenInfo(L).getSymbolicMax(L, this, &Preds); +} + bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) { return getBackedgeTakenInfo(L).isConstantMaxOrZero(this); } @@ -8311,7 +8316,7 @@ static void PushLoopPHIs(const Loop *L, Worklist.push_back(&PN); } -const ScalarEvolution::BackedgeTakenInfo & +ScalarEvolution::BackedgeTakenInfo & ScalarEvolution::getPredicatedBackedgeTakenInfo(const Loop *L) { auto &BTI = getBackedgeTakenInfo(L); if (BTI.hasFullInfo()) @@ -8644,9 +8649,9 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const { return getConstantMax(); } -const 
SCEV * -ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L, - ScalarEvolution *SE) { +const SCEV *ScalarEvolution::BackedgeTakenInfo::getSymbolicMax( + const Loop *L, ScalarEvolution *SE, + SmallVector *Predicates) { if (!SymbolicMax) { // Form an expression for the maximum exit count possible for this loop. We // merge the max and exact information to approximate a version of @@ -8661,6 +8666,12 @@ ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L, "We should only have known counts for exiting blocks that " "dominate latch!"); ExitCounts.push_back(ExitCount); + if (Predicates) + for (const auto *P : ENT.Predicates) + Predicates->push_back(P); + + assert((Predicates || ENT.hasAlwaysTruePredicate()) && + "Predicate should be always true!"); } } if (ExitCounts.empty()) @@ -13609,6 +13620,24 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, P->print(OS, 4); } + Preds.clear(); + auto *PredSymbolicMax = + SE->getPredicatedSymbolicMaxBackedgeTakenCount(L, Preds); + if (SymbolicBTC != PredSymbolicMax) { + OS << "Loop "; + L->getHeader()->printAsOperand(OS, /*PrintType=*/false); + OS << ": "; + if (!isa(PredSymbolicMax)) { + OS << "Predicated symbolic max backedge-taken count is "; + PrintSCEVWithTypeHint(OS, PredSymbolicMax); + } else + OS << "Unpredictable predicated symbolic max backedge-taken count."; + OS << "\n"; + OS << " Predicates:\n"; + for (const auto *P : Preds) + P->print(OS, 4); + } + if (SE->hasLoopInvariantBackedgeTakenCount(L)) { OS << "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); @@ -14822,6 +14851,17 @@ const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() { return BackedgeCount; } +const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() { + if (!SymbolicMaxBackedgeCount) { + SmallVector Preds; + SymbolicMaxBackedgeCount = + SE.getPredicatedSymbolicMaxBackedgeTakenCount(&L, Preds); + for (const auto *P : Preds) + addPredicate(*P); + } + return 
SymbolicMaxBackedgeCount; +} + void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { if (Preds->implies(&Pred)) return; diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll index d40416359b65c6..8dc79a54eb97a5 100644 --- a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll +++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll @@ -12,6 +12,9 @@ define void @test1(i64 %x, ptr %a, ptr %b) { ; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. ; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; entry: br label %header @@ -52,6 +55,9 @@ define void @test2(i64 %x, ptr %a) { ; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. ; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; entry: br label %header From 722a5fce589cea76a0baf89ce731477bae8cf4b8 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 16:27:04 -0700 Subject: [PATCH 84/89] [WebAssembly] Add -wasm-enable-exnref option (#93597) This adds `-wasm-enable-exnref`, which will enable the new EH instructions using `exnref` (adopted in Oct 2023 CG meeting): https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md This option should be used with `-wasm-enable-eh`. 
--- .../WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 7 +++++++ .../WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h | 1 + llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp | 4 ++++ llvm/test/CodeGen/WebAssembly/eh-option-errors.ll | 3 +++ 4 files changed, 15 insertions(+) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index e8f58a19d25e3b..71dfe1062956e3 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -54,6 +54,13 @@ cl::opt // setjmp/longjmp handling using wasm EH instrutions cl::opt WebAssembly::WasmEnableSjLj( "wasm-enable-sjlj", cl::desc("WebAssembly setjmp/longjmp handling")); +// Whether we use the new exnref Wasm EH proposal adopted on Oct 2023. +// Should be used with -wasm-enable-eh. +// Currently set to false by default, but will later change to true and then +// later can be removed after the legacy WAsm EH instructions are removed. 
+cl::opt WebAssembly::WasmEnableExnref( + "wasm-enable-exnref", cl::desc("WebAssembly exception handling (exnref)"), + cl::init(false)); static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/, const Triple &TT, diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index b7498cb4299452..7f1a5f616ed484 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -44,6 +44,7 @@ extern cl::opt WasmEnableEmEH; // asm.js-style EH extern cl::opt WasmEnableEmSjLj; // asm.js-style SjLJ extern cl::opt WasmEnableEH; // EH using Wasm EH instructions extern cl::opt WasmEnableSjLj; // SjLj using Wasm EH instructions +extern cl::opt WasmEnableExnref; // EH using new Wasm EH (exnref) enum OperandType { /// Basic block label in a branch construct. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 68126992ddcd72..fd92a35c2638a5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -385,6 +385,7 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { using WebAssembly::WasmEnableEH; using WebAssembly::WasmEnableEmEH; using WebAssembly::WasmEnableEmSjLj; +using WebAssembly::WasmEnableExnref; using WebAssembly::WasmEnableSjLj; static void basicCheckForEHAndSjLj(TargetMachine *TM) { @@ -401,6 +402,9 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) { if (WasmEnableEmEH && WasmEnableSjLj) report_fatal_error( "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj"); + if (WasmEnableExnref && !WasmEnableEH) + report_fatal_error( + "-wasm-enable-exnref should be used with -wasm-enable-eh"); // Here we make sure TargetOptions.ExceptionModel is the same as // MCAsmInfo.ExceptionsType. 
Normally these have to be the same, because clang diff --git a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll index 74d02ddc405d3f..52a6364e122589 100644 --- a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll +++ b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll @@ -9,6 +9,9 @@ target triple = "wasm32-unknown-unknown" ; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_SJLJ ; EM_EH_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj +; RUN: not --crash llc < %s -wasm-enable-exnref 2>&1 | FileCheck %s --check-prefix=WASM_EXNREF_ONLY +; WASM_EXNREF_ONLY: LLVM ERROR: -wasm-enable-exnref should be used with -wasm-enable-eh + ; RUN: not --crash llc < %s -wasm-enable-eh -exception-model=dwarf 2>&1 | FileCheck %s --check-prefix=EH_MODEL_DWARF ; EH_MODEL_DWARF: LLVM ERROR: -exception-model should be either 'none' or 'wasm' From 60bce6eab4d734b86f49b7638856eb8899bc89e8 Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Tue, 28 May 2024 16:33:20 -0700 Subject: [PATCH 85/89] [WebAssembly] Implement all f16x8 binary instructions. (#93360) This reuses most of the code that was created for f32x4 and f64x2 binary instructions and tries to follow how they were implemented. 
add/sub/mul/div - use regular LL instructions min/max - use the minimum/maximum intrinsic, and also have builtins pmin/pmax - use the wasm.pmax/pmin intrinsics and also have builtins Specified at: https://github.com/WebAssembly/half-precision/blob/29a9b9462c9285d4ccc1a5dc39214ddfd1892658/proposals/half-precision/Overview.md --- .../clang/Basic/BuiltinsWebAssembly.def | 4 ++ clang/lib/CodeGen/CGBuiltin.cpp | 4 ++ clang/test/CodeGen/builtins-wasm.c | 24 +++++++ .../WebAssembly/WebAssemblyISelLowering.cpp | 5 ++ .../WebAssembly/WebAssemblyInstrSIMD.td | 43 +++++++++--- .../CodeGen/WebAssembly/half-precision.ll | 68 +++++++++++++++++++ llvm/test/MC/WebAssembly/simd-encodings.s | 24 +++++++ 7 files changed, 163 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index fd8c1b480d6da0..4e48ff48b60f5f 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -135,6 +135,10 @@ TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_min_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "half-precision") TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 5edf8c79709131..a3c65105033247 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -20806,6 
+20806,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, } case WebAssembly::BI__builtin_wasm_min_f32: case WebAssembly::BI__builtin_wasm_min_f64: + case WebAssembly::BI__builtin_wasm_min_f16x8: case WebAssembly::BI__builtin_wasm_min_f32x4: case WebAssembly::BI__builtin_wasm_min_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); @@ -20816,6 +20817,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, } case WebAssembly::BI__builtin_wasm_max_f32: case WebAssembly::BI__builtin_wasm_max_f64: + case WebAssembly::BI__builtin_wasm_max_f16x8: case WebAssembly::BI__builtin_wasm_max_f32x4: case WebAssembly::BI__builtin_wasm_max_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); @@ -20824,6 +20826,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_pmin_f16x8: case WebAssembly::BI__builtin_wasm_pmin_f32x4: case WebAssembly::BI__builtin_wasm_pmin_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); @@ -20832,6 +20835,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_pmax_f16x8: case WebAssembly::BI__builtin_wasm_pmax_f32x4: case WebAssembly::BI__builtin_wasm_pmax_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 93a6ab06081c99..d6ee4f68700dca 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -825,6 +825,30 @@ float extract_lane_f16x8(f16x8 a, int i) { // WEBASSEMBLY-NEXT: ret float %0 return __builtin_wasm_extract_lane_f16x8(a, i); } + +f16x8 min_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.minimum.v8f16(<8 x half> 
%a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_min_f16x8(a, b); +} + +f16x8 max_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.maximum.v8f16(<8 x half> %a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_max_f16x8(a, b); +} + +f16x8 pmin_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_pmin_f16x8(a, b); +} + +f16x8 pmax_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_pmax_f16x8(a, b); +} __externref_t externref_null() { return __builtin_wasm_ref_null_extern(); // WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern() diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index f9f16498bb390c..4beab9d091581b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -145,6 +145,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setTruncStoreAction(T, MVT::f16, Expand); } + if (Subtarget->hasHalfPrecision()) { + setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); + } + // Expand unavailable integer operations. 
for (auto Op : {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 558e3d859dcd84..baf15ccdbe9edb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -16,33 +16,34 @@ multiclass ABSTRACT_SIMD_I pattern_r, string asmstr_r, string asmstr_s, bits<32> simdop, - Predicate simd_level> { + list reqs> { defm "" : I, - Requires<[simd_level]>; + Requires; } multiclass SIMD_I pattern_r, string asmstr_r = "", - string asmstr_s = "", bits<32> simdop = -1> { + string asmstr_s = "", bits<32> simdop = -1, + list reqs = []> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, !listconcat([HasSIMD128], reqs)>; } multiclass RELAXED_I pattern_r, string asmstr_r = "", string asmstr_s = "", bits<32> simdop = -1> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, [HasRelaxedSIMD]>; } multiclass HALF_PRECISION_I pattern_r, string asmstr_r = "", string asmstr_s = "", bits<32> simdop = -1> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, [HasHalfPrecision]>; } @@ -152,6 +153,19 @@ def F64x2 : Vec { let prefix = "f64x2"; } +def F16x8 : Vec { + let vt = v8f16; + let int_vt = v8i16; + let lane_vt = f32; + let lane_rc = F32; + let lane_bits = 16; + let lane_idx = LaneIdx8; + let lane_load = int_wasm_loadf16_f32; + let splat = PatFrag<(ops node:$x), (v8f16 (splat_vector (f16 $x)))>; + let prefix = "f16x8"; +} + +// TODO: Include F16x8 here when half precision is better supported. 
defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2]; defvar IntVecs = [I8x16, I16x8, I32x4, I64x2]; @@ -781,13 +795,19 @@ def : Pat<(v2i64 (nodes[0] (v2f64 V128:$lhs), (v2f64 V128:$rhs))), // Bitwise operations //===----------------------------------------------------------------------===// -multiclass SIMDBinary simdop> { +multiclass SIMDBinary simdop, list reqs = []> { defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins), [(set (vec.vt V128:$dst), (node (vec.vt V128:$lhs), (vec.vt V128:$rhs)))], vec.prefix#"."#name#"\t$dst, $lhs, $rhs", - vec.prefix#"."#name, simdop>; + vec.prefix#"."#name, simdop, reqs>; +} + +multiclass HalfPrecisionBinary simdop> { + defm "" : SIMDBinary; } multiclass SIMDBitwise simdop, @@ -1199,6 +1219,7 @@ def : Pat<(v2f64 (froundeven (v2f64 V128:$src))), (NEAREST_F64x2 V128:$src)>; multiclass SIMDBinaryFP baseInst> { defm "" : SIMDBinary; defm "" : SIMDBinary; + defm "" : HalfPrecisionBinary; } // Addition: add @@ -1242,7 +1263,7 @@ defm PMAX : SIMDBinaryFP; // Also match the pmin/pmax cases where the operands are int vectors (but the // comparison is still a floating point comparison). This can happen when using // the wasm_simd128.h intrinsics because v128_t is an integer vector. 
-foreach vec = [F32x4, F64x2] in { +foreach vec = [F32x4, F64x2, F16x8] in { defvar pmin = !cast("PMIN_"#vec); defvar pmax = !cast("PMAX_"#vec); def : Pat<(vec.int_vt (vselect @@ -1266,6 +1287,10 @@ def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))), (PMIN_F64x2 V128:$lhs, V128:$rhs)>; def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))), (PMAX_F64x2 V128:$lhs, V128:$rhs)>; +def : Pat<(v8f16 (int_wasm_pmin (v8f16 V128:$lhs), (v8f16 V128:$rhs))), + (PMIN_F16x8 V128:$lhs, V128:$rhs)>; +def : Pat<(v8f16 (int_wasm_pmax (v8f16 V128:$lhs), (v8f16 V128:$rhs))), + (PMAX_F16x8 V128:$lhs, V128:$rhs)>; //===----------------------------------------------------------------------===// // Conversions diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll index d9d3f6be800fdd..73ccea8d652db8 100644 --- a/llvm/test/CodeGen/WebAssembly/half-precision.ll +++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll @@ -35,3 +35,71 @@ define float @extract_lane_v8f16(<8 x half> %v) { %r = call float @llvm.wasm.extract.lane.f16x8(<8 x half> %v, i32 1) ret float %r } + +; CHECK-LABEL: add_v8f16: +; CHECK: f16x8.add $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @add_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fadd <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: sub_v8f16: +; CHECK: f16x8.sub $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @sub_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fsub <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: mul_v8f16: +; CHECK: f16x8.mul $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @mul_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fmul <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: div_v8f16: +; CHECK: f16x8.div $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @div_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fdiv <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: 
min_intrinsic_v8f16: +; CHECK: f16x8.min $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>) +define <8 x half> @min_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) { + %a = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %a +} + +; CHECK-LABEL: max_intrinsic_v8f16: +; CHECK: f16x8.max $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>) +define <8 x half> @max_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) { + %a = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %a +} + +; CHECK-LABEL: pmin_intrinsic_v8f16: +; CHECK: f16x8.pmin $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.wasm.pmin.v8f16(<8 x half>, <8 x half>) +define <8 x half> @pmin_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) { + %v = call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %v +} + +; CHECK-LABEL: pmax_intrinsic_v8f16: +; CHECK: f16x8.pmax $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.wasm.pmax.v8f16(<8 x half>, <8 x half>) +define <8 x half> @pmax_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) { + %v = call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %v +} diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index d397188a9882ea..113a23da776fa9 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -851,4 +851,28 @@ main: # CHECK: f16x8.extract_lane 1 # encoding: [0xfd,0xa1,0x02,0x01] f16x8.extract_lane 1 + # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02] + f16x8.add + + # CHECK: f16x8.sub # encoding: [0xfd,0xb5,0x02] + f16x8.sub + + # CHECK: f16x8.mul # encoding: [0xfd,0xb6,0x02] + f16x8.mul + + # CHECK: f16x8.div # encoding: [0xfd,0xb7,0x02] + f16x8.div + + # CHECK: f16x8.min # encoding: [0xfd,0xb8,0x02] + 
f16x8.min + + # CHECK: f16x8.max # encoding: [0xfd,0xb9,0x02] + f16x8.max + + # CHECK: f16x8.pmin # encoding: [0xfd,0xba,0x02] + f16x8.pmin + + # CHECK: f16x8.pmax # encoding: [0xfd,0xbb,0x02] + f16x8.pmax + end_function From 0edc97f119f3ac3ff96b11183fe5c001a48a9a8d Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Tue, 28 May 2024 16:39:09 -0700 Subject: [PATCH 86/89] [IR][AArch64][PAC] Add "ptrauth(...)" Constant to represent signed pointers. (#85738) This defines a new kind of IR Constant that represents a ptrauth signed pointer, as used in AArch64 PAuth. It allows representing most kinds of signed pointer constants used thus far in the llvm ptrauth implementations, notably those used in the Darwin and ELF ABIs being implemented for c/c++. These signed pointer constants are then lowered to ELF/MachO relocations. These can be simply thought of as a constant `llvm.ptrauth.sign`, with the interesting addition of discriminator computation: the `ptrauth` constant can also represent a combined blend, when both address and integer discriminator operands are used. Both operands are otherwise optional, with default values 0/null. 
--- llvm/docs/LangRef.rst | 34 +++++ llvm/docs/PointerAuth.md | 22 ++++ llvm/include/llvm/AsmParser/LLToken.h | 1 + llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/include/llvm/IR/Constants.h | 66 ++++++++++ llvm/include/llvm/IR/Value.def | 1 + llvm/lib/Analysis/ValueTracking.cpp | 4 + llvm/lib/AsmParser/LLLexer.cpp | 1 + llvm/lib/AsmParser/LLParser.cpp | 54 ++++++++ llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp | 1 + llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 25 +++- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 6 + llvm/lib/IR/AsmWriter.cpp | 21 +++ llvm/lib/IR/Constants.cpp | 121 ++++++++++++++++++ llvm/lib/IR/ConstantsContext.h | 47 +++++++ llvm/lib/IR/LLVMContextImpl.h | 2 + llvm/lib/IR/Verifier.cpp | 23 ++++ llvm/test/Assembler/invalid-ptrauth-const1.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const2.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const3.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const4.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const5.ll | 6 + llvm/test/Assembler/ptrauth-const.ll | 24 ++++ llvm/test/Bitcode/compatibility.ll | 4 + llvm/utils/vim/syntax/llvm.vim | 1 + 25 files changed, 488 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Assembler/invalid-ptrauth-const1.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const2.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const3.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const4.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const5.ll create mode 100644 llvm/test/Assembler/ptrauth-const.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 614dd98b013b35..7b64c477d13c7f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4754,6 +4754,40 @@ reference to the CFI jump table in the ``LowerTypeTests`` pass. These constants may be useful in low-level programs, such as operating system kernels, which need to refer to the actual function body. +.. 
_ptrauth_constant: + +Pointer Authentication Constants +-------------------------------- + +``ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?)`` + +A '``ptrauth``' constant represents a pointer with a cryptographic +authentication signature embedded into some bits, as described in the +`Pointer Authentication `__ document. + +A '``ptrauth``' constant is simply a constant equivalent to the +``llvm.ptrauth.sign`` intrinsic, potentially fed by a discriminator +``llvm.ptrauth.blend`` if needed. + +Its type is the same as the first argument. An integer constant discriminator +and an address discriminator may be optionally specified. Otherwise, they have +values ``i64 0`` and ``ptr null``. + +If the address discriminator is ``null`` then the expression is equivalent to + +.. code-block:: llvm + + %tmp = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 DISC) + %val = inttoptr i64 %tmp to ptr + +Otherwise, the expression is equivalent to: + +.. code-block:: llvm + + %tmp1 = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr ADDRDISC to i64), i64 DISC) + %tmp2 = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 %tmp1) + %val = inttoptr i64 %tmp2 to ptr + .. _constantexprs: Constant Expressions diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md index a8d2b4d8f5f0bd..cf2cc6305f130f 100644 --- a/llvm/docs/PointerAuth.md +++ b/llvm/docs/PointerAuth.md @@ -16,6 +16,7 @@ For more details, see the clang documentation page for At the IR level, it is represented using: * a [set of intrinsics](#intrinsics) (to sign/authenticate pointers) +* a [signed pointer constant](#constant) (to sign globals) * a [call operand bundle](#operand-bundle) (to authenticate called pointers) The current implementation leverages the @@ -225,6 +226,27 @@ with a pointer address discriminator, in a way that is specified by the target implementation. 
+### Constant + +[Intrinsics](#intrinsics) can be used to produce signed pointers dynamically, +in code, but not for signed pointers referenced by constants, in, e.g., global +initializers. + +The latter are represented using a +[``ptrauth`` constant](https://llvm.org/docs/LangRef.html#ptrauth-constant), +which describes an authenticated relocation producing a signed pointer. + +```llvm +ptrauth (ptr CST, i32 KEY, i64 DISC, ptr ADDRDISC) +``` + +is equivalent to: + +```llvm + %disc = call i64 @llvm.ptrauth.blend(i64 ptrtoint(ptr ADDRDISC to i64), i64 DISC) + %signedval = call i64 @llvm.ptrauth.sign(ptr CST, i32 KEY, i64 %disc) +``` + ### Operand Bundle Function pointers used as indirect call targets can be signed when materialized, diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index df61ec6ed30e0b..69821c22dcd619 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -346,6 +346,7 @@ enum Kind { kw_blockaddress, kw_dso_local_equivalent, kw_no_cfi, + kw_ptrauth, kw_freeze, diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index d3b9e96520f88a..9999aee61528e5 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -413,6 +413,7 @@ enum ConstantsCodes { // asmstr,conststr] CST_CODE_CE_GEP_WITH_INRANGE = 31, // [opty, flags, range, n x operands] CST_CODE_CE_GEP = 32, // [opty, flags, n x operands] + CST_CODE_PTRAUTH = 33, // [ptr, key, disc, addrdisc] }; /// CastOpcodes - These are values used in the bitcode files to encode which diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index a1e5005a9d1da5..86f6be7985a23f 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -1008,6 +1008,72 @@ struct OperandTraits : public FixedNumOperandTraits { DEFINE_TRANSPARENT_OPERAND_ACCESSORS(NoCFIValue, Value) +/// A signed pointer, in the 
ptrauth sense. +class ConstantPtrAuth final : public Constant { + friend struct ConstantPtrAuthKeyType; + friend class Constant; + + ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc, + Constant *AddrDisc); + + void *operator new(size_t s) { return User::operator new(s, 4); } + + void destroyConstantImpl(); + Value *handleOperandChangeImpl(Value *From, Value *To); + +public: + /// Return a pointer signed with the specified parameters. + static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc); + + /// Produce a new ptrauth expression signing the given value using + /// the same schema as is stored in one. + ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const; + + /// Transparently provide more efficient getOperand methods. + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); + + /// The pointer that is signed in this ptrauth signed pointer. + Constant *getPointer() const { return cast(Op<0>().get()); } + + /// The Key ID, an i32 constant. + ConstantInt *getKey() const { return cast(Op<1>().get()); } + + /// The integer discriminator, an i64 constant, or 0. + ConstantInt *getDiscriminator() const { + return cast(Op<2>().get()); + } + + /// The address discriminator if any, or the null constant. + /// If present, this must be a value equivalent to the storage location of + /// the only global-initializer user of the ptrauth signed pointer. + Constant *getAddrDiscriminator() const { + return cast(Op<3>().get()); + } + + /// Whether there is any non-null address discriminator. + bool hasAddressDiscriminator() const { + return !getAddrDiscriminator()->isNullValue(); + } + + /// Check whether an authentication operation with key \p Key and (possibly + /// blended) discriminator \p Discriminator is known to be compatible with + /// this ptrauth signed pointer. 
+ bool isKnownCompatibleWith(const Value *Key, const Value *Discriminator, + const DataLayout &DL) const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const Value *V) { + return V->getValueID() == ConstantPtrAuthVal; + } +}; + +template <> +struct OperandTraits + : public FixedNumOperandTraits {}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPtrAuth, Constant) + //===----------------------------------------------------------------------===// /// A constant value that is initialized with an expression using /// other constant values. diff --git a/llvm/include/llvm/IR/Value.def b/llvm/include/llvm/IR/Value.def index 61f7a87666d094..3ece66a529e125 100644 --- a/llvm/include/llvm/IR/Value.def +++ b/llvm/include/llvm/IR/Value.def @@ -81,6 +81,7 @@ HANDLE_CONSTANT(BlockAddress) HANDLE_CONSTANT(ConstantExpr) HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(DSOLocalEquivalent) HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(NoCFIValue) +HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(ConstantPtrAuth) // ConstantAggregate. HANDLE_CONSTANT(ConstantArray) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 3baa8ede28ffaf..08138a5e2f2d9d 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -3140,6 +3140,10 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, return true; } + // Constant ptrauth can be null, iff the base pointer can be. + if (auto *CPA = dyn_cast(V)) + return isKnownNonZero(CPA->getPointer(), DemandedElts, Q, Depth); + // A global variable in address space 0 is non null unless extern weak // or an absolute symbol reference. Other address spaces may have null as a // valid address for a global, so we can't assume anything. 
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 20a1bd29577124..d3ab306904da12 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -710,6 +710,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(blockaddress); KEYWORD(dso_local_equivalent); KEYWORD(no_cfi); + KEYWORD(ptrauth); // Metadata types. KEYWORD(distinct); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 5d2056d2085672..df0827996396ef 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4046,6 +4046,60 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { ID.NoCFI = true; return false; } + case lltok::kw_ptrauth: { + // ValID ::= 'ptrauth' '(' ptr @foo ',' i32 + // (',' i64 (',' ptr addrdisc)? )? ')' + Lex.Lex(); + + Constant *Ptr, *Key; + Constant *Disc = nullptr, *AddrDisc = nullptr; + + if (parseToken(lltok::lparen, + "expected '(' in constant ptrauth expression") || + parseGlobalTypeAndValue(Ptr) || + parseToken(lltok::comma, + "expected comma in constant ptrauth expression") || + parseGlobalTypeAndValue(Key)) + return true; + // If present, parse the optional disc/addrdisc. 
+ if (EatIfPresent(lltok::comma)) + if (parseGlobalTypeAndValue(Disc) || + (EatIfPresent(lltok::comma) && parseGlobalTypeAndValue(AddrDisc))) + return true; + if (parseToken(lltok::rparen, + "expected ')' in constant ptrauth expression")) + return true; + + if (!Ptr->getType()->isPointerTy()) + return error(ID.Loc, "constant ptrauth base pointer must be a pointer"); + + auto *KeyC = dyn_cast(Key); + if (!KeyC || KeyC->getBitWidth() != 32) + return error(ID.Loc, "constant ptrauth key must be i32 constant"); + + ConstantInt *DiscC = nullptr; + if (Disc) { + DiscC = dyn_cast(Disc); + if (!DiscC || DiscC->getBitWidth() != 64) + return error( + ID.Loc, + "constant ptrauth integer discriminator must be i64 constant"); + } else { + DiscC = ConstantInt::get(Type::getInt64Ty(Context), 0); + } + + if (AddrDisc) { + if (!AddrDisc->getType()->isPointerTy()) + return error( + ID.Loc, "constant ptrauth address discriminator must be a pointer"); + } else { + AddrDisc = ConstantPointerNull::get(PointerType::get(Context, 0)); + } + + ID.ConstantVal = ConstantPtrAuth::get(Ptr, KeyC, DiscC, AddrDisc); + ID.Kind = ValID::t_Constant; + return false; + } case lltok::kw_trunc: case lltok::kw_bitcast: diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index c085c715179ba6..b7ed9cdf631454 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -222,6 +222,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID, STRINGIFY_CODE(CST_CODE, CE_UNOP) STRINGIFY_CODE(CST_CODE, DSO_LOCAL_EQUIVALENT) STRINGIFY_CODE(CST_CODE, NO_CFI_VALUE) + STRINGIFY_CODE(CST_CODE, PTRAUTH) case bitc::CST_CODE_BLOCKADDRESS: return "CST_CODE_BLOCKADDRESS"; STRINGIFY_CODE(CST_CODE, DATA) diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 32b9a033173e93..aee627bbde0bf5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ 
b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -517,7 +517,8 @@ class BitcodeConstant final : public Value, static constexpr uint8_t NoCFIOpcode = 252; static constexpr uint8_t DSOLocalEquivalentOpcode = 251; static constexpr uint8_t BlockAddressOpcode = 250; - static constexpr uint8_t FirstSpecialOpcode = BlockAddressOpcode; + static constexpr uint8_t ConstantPtrAuthOpcode = 249; + static constexpr uint8_t FirstSpecialOpcode = ConstantPtrAuthOpcode; // Separate struct to make passing different number of parameters to // BitcodeConstant::create() more convenient. @@ -1562,6 +1563,18 @@ Expected BitcodeReader::materializeValue(unsigned StartValID, C = ConstantExpr::get(BC->Opcode, ConstOps[0], ConstOps[1], BC->Flags); } else { switch (BC->Opcode) { + case BitcodeConstant::ConstantPtrAuthOpcode: { + auto *Key = dyn_cast(ConstOps[1]); + if (!Key) + return error("ptrauth key operand must be ConstantInt"); + + auto *Disc = dyn_cast(ConstOps[2]); + if (!Disc) + return error("ptrauth disc operand must be ConstantInt"); + + C = ConstantPtrAuth::get(ConstOps[0], Key, Disc, ConstOps[3]); + break; + } case BitcodeConstant::NoCFIOpcode: { auto *GV = dyn_cast(ConstOps[0]); if (!GV) @@ -3644,6 +3657,16 @@ Error BitcodeReader::parseConstants() { Record[1]); break; } + case bitc::CST_CODE_PTRAUTH: { + if (Record.size() < 4) + return error("Invalid ptrauth record"); + // Ptr, Key, Disc, AddrDisc + V = BitcodeConstant::create(Alloc, CurTy, + BitcodeConstant::ConstantPtrAuthOpcode, + {(unsigned)Record[0], (unsigned)Record[1], + (unsigned)Record[2], (unsigned)Record[3]}); + break; + } } assert(V->getType() == getTypeByID(CurTyID) && "Incorrect result type ID"); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 3d653fe4458f4b..046dad5721c4ce 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2848,6 +2848,12 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, 
unsigned LastVal, Code = bitc::CST_CODE_NO_CFI_VALUE; Record.push_back(VE.getTypeID(NC->getGlobalValue()->getType())); Record.push_back(VE.getValueID(NC->getGlobalValue())); + } else if (const auto *CPA = dyn_cast(C)) { + Code = bitc::CST_CODE_PTRAUTH; + Record.push_back(VE.getValueID(CPA->getPointer())); + Record.push_back(VE.getValueID(CPA->getKey())); + Record.push_back(VE.getValueID(CPA->getDiscriminator())); + Record.push_back(VE.getValueID(CPA->getAddrDiscriminator())); } else { #ifndef NDEBUG C->dump(); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ced5d78f994ab5..8b1a21f962b08f 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1594,6 +1594,27 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } + if (const ConstantPtrAuth *CPA = dyn_cast(CV)) { + Out << "ptrauth ("; + + // ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?) + unsigned NumOpsToWrite = 2; + if (!CPA->getOperand(2)->isNullValue()) + NumOpsToWrite = 3; + if (!CPA->getOperand(3)->isNullValue()) + NumOpsToWrite = 4; + + ListSeparator LS; + for (unsigned i = 0, e = NumOpsToWrite; i != e; ++i) { + Out << LS; + WriterCtx.TypePrinter->print(CPA->getOperand(i)->getType(), Out); + Out << ' '; + WriteAsOperandInternal(Out, CPA->getOperand(i), WriterCtx); + } + Out << ')'; + return; + } + if (const ConstantArray *CA = dyn_cast(CV)) { Type *ETy = CA->getType()->getElementType(); Out << '['; diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index cfb89d557db479..119fcb4fa03461 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -550,6 +550,9 @@ void llvm::deleteConstant(Constant *C) { case Constant::NoCFIValueVal: delete static_cast(C); break; + case Constant::ConstantPtrAuthVal: + delete static_cast(C); + break; case Constant::UndefValueVal: delete static_cast(C); break; @@ -2015,6 +2018,124 @@ Value *NoCFIValue::handleOperandChangeImpl(Value *From, Value *To) { return nullptr; } 
+//---- ConstantPtrAuth::get() implementations. +// + +ConstantPtrAuth *ConstantPtrAuth::get(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc) { + Constant *ArgVec[] = {Ptr, Key, Disc, AddrDisc}; + ConstantPtrAuthKeyType MapKey(ArgVec); + LLVMContextImpl *pImpl = Ptr->getContext().pImpl; + return pImpl->ConstantPtrAuths.getOrCreate(Ptr->getType(), MapKey); +} + +ConstantPtrAuth *ConstantPtrAuth::getWithSameSchema(Constant *Pointer) const { + return get(Pointer, getKey(), getDiscriminator(), getAddrDiscriminator()); +} + +ConstantPtrAuth::ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc) + : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, &Op<0>(), 4) { + assert(Ptr->getType()->isPointerTy()); + assert(Key->getBitWidth() == 32); + assert(Disc->getBitWidth() == 64); + assert(AddrDisc->getType()->isPointerTy()); + setOperand(0, Ptr); + setOperand(1, Key); + setOperand(2, Disc); + setOperand(3, AddrDisc); +} + +/// Remove the constant from the constant table. 
+void ConstantPtrAuth::destroyConstantImpl() { + getType()->getContext().pImpl->ConstantPtrAuths.remove(this); +} + +Value *ConstantPtrAuth::handleOperandChangeImpl(Value *From, Value *ToV) { + assert(isa(ToV) && "Cannot make Constant refer to non-constant!"); + Constant *To = cast(ToV); + + SmallVector Values; + Values.reserve(getNumOperands()); + + unsigned NumUpdated = 0; + + Use *OperandList = getOperandList(); + unsigned OperandNo = 0; + for (Use *O = OperandList, *E = OperandList + getNumOperands(); O != E; ++O) { + Constant *Val = cast(O->get()); + if (Val == From) { + OperandNo = (O - OperandList); + Val = To; + ++NumUpdated; + } + Values.push_back(Val); + } + + return getContext().pImpl->ConstantPtrAuths.replaceOperandsInPlace( + Values, this, From, To, NumUpdated, OperandNo); +} + +bool ConstantPtrAuth::isKnownCompatibleWith(const Value *Key, + const Value *Discriminator, + const DataLayout &DL) const { + // If the keys are different, there's no chance for this to be compatible. + if (getKey() != Key) + return false; + + // We can have 3 kinds of discriminators: + // - simple, integer-only: `i64 x, ptr null` vs. `i64 x` + // - address-only: `i64 0, ptr p` vs. `ptr p` + // - blended address/integer: `i64 x, ptr p` vs. `@llvm.ptrauth.blend(p, x)` + + // If this constant has a simple discriminator (integer, no address), easy: + // it's compatible iff the provided full discriminator is also a simple + // discriminator, identical to our integer discriminator. + if (!hasAddressDiscriminator()) + return getDiscriminator() == Discriminator; + + // Otherwise, we can isolate address and integer discriminator components. + const Value *AddrDiscriminator = nullptr; + + // This constant may or may not have an integer discriminator (instead of 0). + if (!getDiscriminator()->isNullValue()) { + // If it does, there's an implicit blend. We need to have a matching blend + // intrinsic in the provided full discriminator. 
+ if (!match(Discriminator, + m_Intrinsic( + m_Value(AddrDiscriminator), m_Specific(getDiscriminator())))) + return false; + } else { + // Otherwise, interpret the provided full discriminator as address-only. + AddrDiscriminator = Discriminator; + } + + // Either way, we can now focus on comparing the address discriminators. + + // Discriminators are i64, so the provided addr disc may be a ptrtoint. + if (auto *Cast = dyn_cast(AddrDiscriminator)) + AddrDiscriminator = Cast->getPointerOperand(); + + // Beyond that, we're only interested in compatible pointers. + if (getAddrDiscriminator()->getType() != AddrDiscriminator->getType()) + return false; + + // These are often the same constant GEP, making them trivially equivalent. + if (getAddrDiscriminator() == AddrDiscriminator) + return true; + + // Finally, they may be equivalent base+offset expressions. + APInt Off1(DL.getIndexTypeSizeInBits(getAddrDiscriminator()->getType()), 0); + auto *Base1 = getAddrDiscriminator()->stripAndAccumulateConstantOffsets( + DL, Off1, /*AllowNonInbounds=*/true); + + APInt Off2(DL.getIndexTypeSizeInBits(AddrDiscriminator->getType()), 0); + auto *Base2 = AddrDiscriminator->stripAndAccumulateConstantOffsets( + DL, Off2, /*AllowNonInbounds=*/true); + + return Base1 == Base2 && Off1 == Off2; +} + //---- ConstantExpr::get() implementations. 
// diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 7067d0d121117b..5153880b5cab64 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -23,6 +23,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -286,6 +287,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value) template struct ConstantAggrKeyType; struct InlineAsmKeyType; struct ConstantExprKeyType; +struct ConstantPtrAuthKeyType; template struct ConstantInfo; template <> struct ConstantInfo { @@ -308,6 +310,10 @@ template <> struct ConstantInfo { using ValType = ConstantAggrKeyType; using TypeClass = VectorType; }; +template <> struct ConstantInfo { + using ValType = ConstantPtrAuthKeyType; + using TypeClass = Type; +}; template struct ConstantAggrKeyType { ArrayRef Operands; @@ -536,6 +542,47 @@ struct ConstantExprKeyType { } }; +struct ConstantPtrAuthKeyType { + ArrayRef Operands; + + ConstantPtrAuthKeyType(ArrayRef Operands) : Operands(Operands) {} + + ConstantPtrAuthKeyType(ArrayRef Operands, const ConstantPtrAuth *) + : Operands(Operands) {} + + ConstantPtrAuthKeyType(const ConstantPtrAuth *C, + SmallVectorImpl &Storage) { + assert(Storage.empty() && "Expected empty storage"); + for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I) + Storage.push_back(cast(C->getOperand(I))); + Operands = Storage; + } + + bool operator==(const ConstantPtrAuthKeyType &X) const { + return Operands == X.Operands; + } + + bool operator==(const ConstantPtrAuth *C) const { + if (Operands.size() != C->getNumOperands()) + return false; + for (unsigned I = 0, E = Operands.size(); I != E; ++I) + if (Operands[I] != C->getOperand(I)) + return false; + return true; + } + + unsigned getHash() const { + return hash_combine_range(Operands.begin(), Operands.end()); + } + + 
using TypeClass = typename ConstantInfo::TypeClass; + + ConstantPtrAuth *create(TypeClass *Ty) const { + return new ConstantPtrAuth(Operands[0], cast(Operands[1]), + cast(Operands[2]), Operands[3]); + } +}; + // Free memory for a given constant. Assumes the constant has already been // removed from all relevant maps. void deleteConstant(Constant *C); diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 399fe0dad26c73..392e0d16f1761e 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -1562,6 +1562,8 @@ class LLVMContextImpl { DenseMap NoCFIValues; + ConstantUniqueMap ConstantPtrAuths; + ConstantUniqueMap ExprConstants; ConstantUniqueMap InlineAsms; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 50f8d6ec842017..684e54444621b5 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -629,6 +629,7 @@ class Verifier : public InstVisitor, VerifierSupport { void visitConstantExprsRecursively(const Constant *EntryC); void visitConstantExpr(const ConstantExpr *CE); + void visitConstantPtrAuth(const ConstantPtrAuth *CPA); void verifyInlineAsmCall(const CallBase &Call); void verifyStatepoint(const CallBase &Call); void verifyFrameRecoverIndices(); @@ -2422,6 +2423,9 @@ void Verifier::visitConstantExprsRecursively(const Constant *EntryC) { if (const auto *CE = dyn_cast(C)) visitConstantExpr(CE); + if (const auto *CPA = dyn_cast(C)) + visitConstantPtrAuth(CPA); + if (const auto *GV = dyn_cast(C)) { // Global Values get visited separately, but we do need to make sure // that the global value is in the correct module @@ -2449,6 +2453,23 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) { "Invalid bitcast", CE); } +void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) { + Check(CPA->getPointer()->getType()->isPointerTy(), + "signed ptrauth constant base pointer must have pointer type"); + + Check(CPA->getType() == CPA->getPointer()->getType(), + "signed ptrauth 
constant must have same type as its base pointer"); + + Check(CPA->getKey()->getBitWidth() == 32, + "signed ptrauth constant key must be i32 constant integer"); + + Check(CPA->getAddrDiscriminator()->getType()->isPointerTy(), + "signed ptrauth constant address discriminator must be a pointer"); + + Check(CPA->getDiscriminator()->getBitWidth() == 64, + "signed ptrauth constant discriminator must be i64 constant integer"); +} + bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) { // There shouldn't be more attribute sets than there are parameters plus the // function and return value. @@ -5090,6 +5111,8 @@ void Verifier::visitInstruction(Instruction &I) { } else if (isa(I.getOperand(i))) { Check(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i), "Cannot take the address of an inline asm!", &I); + } else if (auto *CPA = dyn_cast(I.getOperand(i))) { + visitConstantExprsRecursively(CPA); } else if (ConstantExpr *CE = dyn_cast(I.getOperand(i))) { if (CE->getType()->isPtrOrPtrVectorTy()) { // If we have a ConstantExpr pointer, we need to see if it came from an diff --git a/llvm/test/Assembler/invalid-ptrauth-const1.ll b/llvm/test/Assembler/invalid-ptrauth-const1.ll new file mode 100644 index 00000000000000..fba2e230782382 --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const1.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth base pointer must be a pointer +@auth_var = global ptr ptrauth (i32 42, i32 0) diff --git a/llvm/test/Assembler/invalid-ptrauth-const2.ll b/llvm/test/Assembler/invalid-ptrauth-const2.ll new file mode 100644 index 00000000000000..4499c42601c99e --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const2.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth key must be i32 constant +@auth_var = global ptr ptrauth (ptr @var, i32 ptrtoint (ptr @var to i32)) diff --git 
a/llvm/test/Assembler/invalid-ptrauth-const3.ll b/llvm/test/Assembler/invalid-ptrauth-const3.ll new file mode 100644 index 00000000000000..3f2688d92a0010 --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const3.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth address discriminator must be a pointer +@auth_var = global ptr ptrauth (ptr @var, i32 2, i64 65535, i8 0) diff --git a/llvm/test/Assembler/invalid-ptrauth-const4.ll b/llvm/test/Assembler/invalid-ptrauth-const4.ll new file mode 100644 index 00000000000000..843a220458a61b --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const4.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth integer discriminator must be i64 constant +@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr null, i64 ptrtoint (ptr @var to i64)) diff --git a/llvm/test/Assembler/invalid-ptrauth-const5.ll b/llvm/test/Assembler/invalid-ptrauth-const5.ll new file mode 100644 index 00000000000000..9b47f6f5f423fc --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const5.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth integer discriminator must be i64 constant +@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr @var)) diff --git a/llvm/test/Assembler/ptrauth-const.ll b/llvm/test/Assembler/ptrauth-const.ll new file mode 100644 index 00000000000000..94d35146d5927b --- /dev/null +++ b/llvm/test/Assembler/ptrauth-const.ll @@ -0,0 +1,24 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +@var = global i32 0 + +; CHECK: @basic = global ptr ptrauth (ptr @var, i32 0) +@basic = global ptr ptrauth (ptr @var, i32 0) + +; CHECK: @keyed = global ptr ptrauth (ptr @var, i32 3) +@keyed = global ptr ptrauth (ptr @var, i32 3) + +; CHECK: @intdisc = global ptr ptrauth (ptr @var, i32 0, i64 -1) +@intdisc = global ptr ptrauth (ptr @var, 
i32 0, i64 -1) + +; CHECK: @addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc) +@addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc) + + +@var1 = addrspace(1) global i32 0 + +; CHECK: @addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0) +@addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0) + +; CHECK: @addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc) +@addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc) diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index b374924516d665..2a846e036924c7 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -217,6 +217,10 @@ declare void @g.f1() ; CHECK: @g.sanitize_address_dyninit = global i32 0, sanitize_address_dyninit ; CHECK: @g.sanitize_multiple = global i32 0, sanitize_memtag, sanitize_address_dyninit +; ptrauth constant +@auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535, ptr null) +; CHECK: @auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535) + ;; Aliases ; Format: @ = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal] ; [unnamed_addr] alias @ diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index d86e3d1ddbc27f..905d696400ca37 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -150,6 +150,7 @@ syn keyword llvmKeyword \ preallocated \ private \ protected + \ ptrauth \ ptx_device \ ptx_kernel \ readnone From 6f529aaf666624c26715aa348955b26a684d1250 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 23:37:40 +0000 Subject: [PATCH 87/89] [WebAssembly] Remove IIT_EXNREF This was added in #93586 but caused a compilation warning and is not used anyway. 
--- llvm/include/llvm/IR/Intrinsics.td | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index c3ac53837444ef..107442623ab7bd 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -316,7 +316,6 @@ def IIT_PPCF128 : IIT_VT; def IIT_V3 : IIT_Vec<3, 53>; def IIT_EXTERNREF : IIT_VT; def IIT_FUNCREF : IIT_VT; -def IIT_EXNREF: IIT_VT; def IIT_I2 : IIT_Int<2, 57>; def IIT_I4 : IIT_Int<4, 58>; def IIT_AARCH64_SVCOUNT : IIT_VT; From bd5cd4b837b67f8d549f072f37dd09295b4bf9f7 Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Tue, 28 May 2024 20:01:47 -0400 Subject: [PATCH 88/89] Fix trigger for libc++ job rerunner. Testing github actions is such a pain. I swear it should match now. --- .github/workflows/restart-preempted-libcxx-jobs.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml index 5682b0a4f52c3d..88924fb3cd7791 100644 --- a/.github/workflows/restart-preempted-libcxx-jobs.yaml +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -11,18 +11,16 @@ name: Restart Preempted Libc++ Workflow on: workflow_run: - workflows: - - Build and Test libc\+\+ + workflows: [Build and Test libc\+\+] types: - - failure - - canceled + - completed permissions: contents: read jobs: restart: - if: github.repository_owner == 'llvm' + if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled') name: "Restart Job" permissions: statuses: read From 5bfe4b93e15ad38f211c5dec64be0eeaa4c8e914 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 28 May 2024 20:04:41 -0400 Subject: [PATCH 89/89] [mlir][arith] Disallow casting tensor dimensions (#93349) Tighten the verifier for arith cast ops to disallow changing tensor dimensions, e.g., static to dynamic. 
After this change: * `arith.cast_op %x : tensor<4xi32> to tensor<4xf32>` remains valid * `arith.cast_op %x : tensor<4xi32> to tensor` becomes invalid * `arith.cast_op %x : tensor to tensor<4xf32>` becomes invalid This is mostly to simplify the op semantics. See the discussion thread for more context: https://discourse.llvm.org/t/rfc-remove-arith-math-ops-on-tensors/74357/63. --- .../include/mlir/Dialect/Arith/IR/ArithOps.td | 19 +++++++-- mlir/test/Dialect/Arith/canonicalize.mlir | 8 ---- mlir/test/Dialect/Arith/invalid.mlir | 42 ++++++++++++++++++- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td index 46248dad3be9e0..81ed0f924a2e2c 100644 --- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td +++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td @@ -83,12 +83,25 @@ class Arith_FloatBinaryOp traits = []> : attr-dict `:` type($result) }]; } +// Checks that tensor input and outputs have identical shapes. This is stricker +// than the verification done in `SameOperandsAndResultShape` that allows for +// tensor dimensions to be 'compatible' (e.g., dynamic dimensions being +// compatible with static ones). +def SameInputOutputTensorDims : PredOpTrait< + "input and output have the same tensor dimensions", + AllMatchSameOperatorPred<["in", "out"], + "(::llvm::isa<::mlir::TensorType>($_self.getType()) ?" + " ::llvm::cast<::mlir::TensorType>($_self.getType()).getShape() :" + " ::llvm::ArrayRef{})">>; + // Base class for arithmetic cast operations. Requires a single operand and -// result. If either is a shaped type, then the other must be of the same shape. +// result. If either is a shaped type, then the other must be of the same +// shape. In the case of tensor types, this also includes the corresponding +// operand/result dimensions being equal. 
class Arith_CastOp traits = []> : Arith_Op]>, + SameInputOutputTensorDims, DeclareOpInterfaceMethods]>, Arguments<(ins From:$in)>, Results<(outs To:$out)> { let assemblyFormat = "$in attr-dict `:` type($in) `to` type($out)"; @@ -1231,7 +1244,7 @@ def Arith_TruncIOp : Arith_IToICastOp<"trunci"> { def Arith_TruncFOp : Arith_Op<"truncf", - [Pure, SameOperandsAndResultShape, + [Pure, SameOperandsAndResultShape, SameInputOutputTensorDims, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>, Arguments<(ins FloatLike:$in, diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 1a387c20c4b297..e4f95bb0545a20 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2950,14 +2950,6 @@ func.func @unsignedExtendConstantResource() -> tensor { return %ext : tensor } -// Just checks that this doesn't crash. -// CHECK-LABEL: @signedExtendSplatAsDynamicShape -func.func @signedExtendSplatAsDynamicShape() -> tensor { - %splat = arith.constant dense<5> : tensor<2xi16> - %extsplat = arith.extsi %splat : tensor<2xi16> to tensor - return %extsplat : tensor -} - // CHECK-LABEL: @extsi_i0 // CHECK: %[[ZERO:.*]] = arith.constant 0 : i16 // CHECK: return %[[ZERO]] : i16 diff --git a/mlir/test/Dialect/Arith/invalid.mlir b/mlir/test/Dialect/Arith/invalid.mlir index ada849220bb839..652aa738ad3924 100644 --- a/mlir/test/Dialect/Arith/invalid.mlir +++ b/mlir/test/Dialect/Arith/invalid.mlir @@ -1,13 +1,21 @@ // RUN: mlir-opt -split-input-file %s -verify-diagnostics func.func @test_index_cast_shape_error(%arg0 : tensor) -> tensor<2xi64> { - // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}} + // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}} %0 = arith.index_cast %arg0 : tensor to tensor<2xi64> return %0 : tensor<2xi64> } // ----- +func.func @test_index_cast_shape_dim_error(%arg0 : 
tensor<2xindex>) -> tensor { + // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.index_cast %arg0 : tensor<2xindex> to tensor + return %0 : tensor +} + +// ----- + func.func @test_index_cast_tensor_error(%arg0 : tensor) -> i64 { // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}} %0 = arith.index_cast %arg0 : tensor to i64 @@ -655,6 +663,14 @@ func.func @extsi_scalable_to_fl(%arg0 : vector<[4]xi32>) { // ----- +func.func @extsi_tensor_dim(%arg0 : tensor<4xi32>) { + // expected-error@+1 {{'arith.extsi' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.extsi %arg0 : tensor<4xi32> to tensor + return +} + +// ----- + func.func @extf_scalable_to_fl(%arg0 : vector<[4]xf32>) { // expected-error@+1 {{'arith.extf' op requires the same shape for all operands and results}} %0 = arith.extf %arg0 : vector<[4]xf32> to vector<4xf64> @@ -703,6 +719,22 @@ func.func @bitcast_scalable_to_fl(%arg0 : vector<[4]xf32>) { // ----- +func.func @bitcast_tensor_dim(%arg0 : tensor<4xf32>) { + // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.bitcast %arg0 : tensor<4xf32> to tensor + return +} + +// ----- + +func.func @bitcast_tensor_dim(%arg0 : tensor) { + // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.bitcast %arg0 : tensor to tensor<4xi32> + return +} + +// ----- + func.func @trunci_fl_to_scalable(%arg0 : vector<4xi32>) { // expected-error@+1 {{'arith.trunci' op requires the same shape for all operands and results}} %0 = arith.trunci %arg0 : vector<4xi32> to vector<[4]xi8> @@ -719,6 +751,14 @@ func.func @truncf_fl_to_scalable(%arg0 : vector<4xf64>) { // ----- +func.func @truncf_tensor_dim(%arg0 : tensor<4xf64>) { + // expected-error@+1 {{'arith.truncf' op failed to 
verify that input and output have the same tensor dimensions}} + %0 = arith.truncf %arg0 : tensor<4xf64> to tensor + return +} + +// ----- + func.func @extui_fl_to_scalable(%arg0 : vector<4xi32>) { // expected-error@+1 {{'arith.extui' op requires the same shape for all operands and results}} %0 = arith.extui %arg0 : vector<4xi32> to vector<[4]xi64>