From 125bd061c3240afd92edd5ef8c29a7b3d24a5cd1 Mon Sep 17 00:00:00 2001 From: josel-amd <166385423+josel-amd@users.noreply.github.com> Date: Tue, 28 May 2024 14:36:24 +0200 Subject: [PATCH 01/89] [mlir][emitc] Support conversion of arith.divsi and arith.remsi to EmitC (#93450) --- mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp | 2 ++ mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp index 0be3d76f556de9..388794ec122d21 100644 --- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp +++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp @@ -394,7 +394,9 @@ void mlir::populateArithToEmitCPatterns(TypeConverter &typeConverter, ArithConstantOpConversionPattern, ArithOpConversion, ArithOpConversion, + ArithOpConversion, ArithOpConversion, + ArithOpConversion, ArithOpConversion, IntegerOpConversion, IntegerOpConversion, diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir index b453b69a214e86..dac3fd99b607ce 100644 --- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir +++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir @@ -88,6 +88,17 @@ func.func @arith_index(%arg0: index, %arg1: index) { // ----- +// CHECK-LABEL: arith_signed_integer_div_rem +func.func @arith_signed_integer_div_rem(%arg0: i32, %arg1: i32) { + // CHECK: emitc.div %arg0, %arg1 : (i32, i32) -> i32 + %0 = arith.divsi %arg0, %arg1 : i32 + // CHECK: emitc.rem %arg0, %arg1 : (i32, i32) -> i32 + %1 = arith.remsi %arg0, %arg1 : i32 + return +} + +// ----- + func.func @arith_select(%arg0: i1, %arg1: tensor<8xi32>, %arg2: tensor<8xi32>) -> () { // CHECK: [[V0:[^ ]*]] = emitc.conditional %arg0, %arg1, %arg2 : tensor<8xi32> %0 = arith.select %arg0, %arg1, %arg2 : i1, tensor<8xi32> From fe5d791517b1cc11bd518f0338516f157fe18661 Mon Sep 17 00:00:00 2001 From: Matt 
Arsenault Date: Thu, 23 May 2024 21:18:17 +0200 Subject: [PATCH 02/89] AMDGPU: Add some multi-use negative tests for minimum3/maximum3 --- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 206 ++++++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/fminimum3.ll | 206 ++++++++++++++++++++++++++ 2 files changed, 412 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 9690e126dfcfcb..3ec36f03a48aa4 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -3249,3 +3249,209 @@ define double @v_fmaximum3_f64_const1_const2(double %a) { %max1 = call double @llvm.maximum.f64(double %max0, double 16.0) ret double %max1 } + +define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c) { +; GFX12-LABEL: v_no_fmaximum3_f32__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f32 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f32 v1, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fmaximum3_f32__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + %insert.0 = insertelement <2 x float> poison, float %max0, i32 0 + %insert.1 = insertelement <2 x float> %insert.0, float %max1, i32 1 + ret <2 x float> %insert.1 +} + +define amdgpu_ps <2 x i32> 
@s_no_fmaximum3_f32__multi_use(float inreg %a, float inreg %b, float inreg %c) { +; GFX12-LABEL: s_no_fmaximum3_f32__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_maximum_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_maximum_f32 s1, s0, s2 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_no_fmaximum3_f32__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_max_f32_e32 v1, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + %cast0 = bitcast float %max0 to i32 + %cast1 = bitcast float %max1 to i32 + %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0) + %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast1) + %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) { +; GFX12-LABEL: v_no_fmaximum3_f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f16 v1, v0, v2 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fmaximum3_f16__multi_use: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + %insert.0 = insertelement <2 x half> poison, half %max0, i32 0 + %insert.1 = insertelement <2 x half> %insert.0, half %max1, i32 1 + ret <2 x half> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half inreg %b, half inreg %c) { +; GFX12-LABEL: s_no_fmaximum3_f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_maximum_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: s_maximum_f16 s1, s0, s2 +; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX12-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_no_fmaximum3_f16__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + %cast0 = bitcast half %max0 to i16 + %cast1 = bitcast half %max1 to i16 + %ext0 = zext i16 %cast0 
to i32 + %ext1 = zext i16 %cast1 to i32 + %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext0) + %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext1) + %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_no_fmaximum3_v2f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_maximum_f16 v1, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fmaximum3_v2f16__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.maximum.f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.maximum.f16(<2 x half> %max0, <2 x half> %c) + %concat = shufflevector <2 x half> %max0, <2 x half> %max1, <4 x i32> + ret <4 x 
half> %concat +} + +define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double %c) { +; GFX12-LABEL: v_no_fmaximum3_f64__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[2:3], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fmaximum3_f64__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.maximum.f64(double %a, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double %c) + %insert.0 = insertelement <2 x double> poison, double %max0, i32 0 + %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1 + ret <2 x double> %insert.1 +} diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 7481fff251d895..0e0b73b88d2dca 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -3249,3 +3249,209 @@ define double @v_fminimum3_f64_const1_const2(double %a) { %max1 = call double @llvm.minimum.f64(double %max0, double 16.0) ret double %max1 } + +define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c) { +; GFX12-LABEL: v_no_fminimum3_f32__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f32 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f32 v1, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fminimum3_f32__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + %insert.0 = insertelement <2 x float> poison, float %max0, i32 0 + %insert.1 = insertelement <2 x float> %insert.0, float %max1, i32 1 + ret <2 x float> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_no_fminimum3_f32__multi_use(float inreg %a, float inreg %b, float inreg %c) { +; GFX12-LABEL: s_no_fminimum3_f32__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_minimum_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_minimum_f32 s1, s0, s2 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_no_fminimum3_f32__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_min_f32_e32 v1, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call 
float @llvm.minimum.f32(float %max0, float %c) + %cast0 = bitcast float %max0 to i32 + %cast1 = bitcast float %max1 to i32 + %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0) + %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast1) + %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) { +; GFX12-LABEL: v_no_fminimum3_f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f16 v1, v0, v2 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fminimum3_f16__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + %insert.0 = insertelement <2 x half> poison, half %max0, i32 0 + %insert.1 = insertelement <2 x half> %insert.0, half %max1, i32 1 + ret <2 x half> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half inreg %b, half inreg %c) { +; GFX12-LABEL: s_no_fminimum3_f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_minimum_f16 s0, s0, s1 +; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: s_minimum_f16 s1, s0, s2 +; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX12-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_no_fminimum3_f16__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + %cast0 = bitcast half %max0 to i16 + %cast1 = bitcast half %max1 to i16 + %ext0 = zext i16 %cast0 to i32 + %ext1 = zext i16 %cast1 to i32 + %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext0) + %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext1) + %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_no_fminimum3_v2f16__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_minimum_f16 v1, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fminimum3_v2f16__multi_use: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.minimum.f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.minimum.f16(<2 x half> %max0, <2 x half> %c) + %concat = shufflevector <2 x half> %max0, <2 x half> %max1, <4 x i32> + ret <4 x half> %concat +} + +define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double %c) { +; GFX12-LABEL: v_no_fminimum3_f64__multi_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[2:3], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_no_fminimum3_f64__multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc +; GFX9-NEXT: v_min_f64 
v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.minimum.f64(double %a, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + %insert.0 = insertelement <2 x double> poison, double %max0, i32 0 + %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1 + ret <2 x double> %insert.1 +} From 1da52caf2946e56f69eae75a60088a54edda1db5 Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Tue, 28 May 2024 08:50:55 -0400 Subject: [PATCH 03/89] [flang] Fix typos PPC intrinsics tests (NFC) (#92943) --- flang/test/Lower/PowerPC/ppc-vec-load.f90 | 119 +++++++-------- .../Lower/PowerPC/ppc-vec-shift-be-le.f90 | 140 +++++++++--------- 2 files changed, 130 insertions(+), 129 deletions(-) diff --git a/flang/test/Lower/PowerPC/ppc-vec-load.f90 b/flang/test/Lower/PowerPC/ppc-vec-load.f90 index 4d51512df0f7b4..a81ed055ce08c8 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-load.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-load.f90 @@ -1,12 +1,13 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s +! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE","LLVM" %s +! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr9 -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR_P9","LLVM" %s +! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE","LLVM" %s ! REQUIRES: target=powerpc{{.*}} !---------------------- ! 
vec_ld !---------------------- -! CHECK-LABEL: @vec_ld_testi8 +! LLVM-LABEL: @vec_ld_testi8 subroutine vec_ld_testi8(arg1, arg2, res) integer(1) :: arg1 vector(integer(1)) :: arg2, res @@ -19,7 +20,7 @@ subroutine vec_ld_testi8(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testi8 -! CHECK-LABEL: @vec_ld_testi16 +! LLVM-LABEL: @vec_ld_testi16 subroutine vec_ld_testi16(arg1, arg2, res) integer(2) :: arg1 vector(integer(2)) :: arg2, res @@ -32,7 +33,7 @@ subroutine vec_ld_testi16(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testi16 -! CHECK-LABEL: @vec_ld_testi32 +! LLVM-LABEL: @vec_ld_testi32 subroutine vec_ld_testi32(arg1, arg2, res) integer(4) :: arg1 vector(integer(4)) :: arg2, res @@ -44,7 +45,7 @@ subroutine vec_ld_testi32(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testi32 -! CHECK-LABEL: @vec_ld_testf32 +! LLVM-LABEL: @vec_ld_testf32 subroutine vec_ld_testf32(arg1, arg2, res) integer(8) :: arg1 vector(real(4)) :: arg2, res @@ -58,7 +59,7 @@ subroutine vec_ld_testf32(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testf32 -! CHECK-LABEL: @vec_ld_testu32 +! LLVM-LABEL: @vec_ld_testu32 subroutine vec_ld_testu32(arg1, arg2, res) integer(1) :: arg1 vector(unsigned(4)) :: arg2, res @@ -70,7 +71,7 @@ subroutine vec_ld_testu32(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_ld_testu32 -! CHECK-LABEL: @vec_ld_testi32a +! LLVM-LABEL: @vec_ld_testi32a subroutine vec_ld_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(10) @@ -83,7 +84,7 @@ subroutine vec_ld_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_ld_testi32a -! CHECK-LABEL: @vec_ld_testf32av +! 
LLVM-LABEL: @vec_ld_testf32av subroutine vec_ld_testf32av(arg1, arg2, res) integer(8) :: arg1 vector(real(4)) :: arg2(2, 4, 8) @@ -98,7 +99,7 @@ subroutine vec_ld_testf32av(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_ld_testf32av -! CHECK-LABEL: @vec_ld_testi32s +! LLVM-LABEL: @vec_ld_testi32s subroutine vec_ld_testi32s(arg1, arg2, res) integer(4) :: arg1 real(4) :: arg2 @@ -116,7 +117,7 @@ end subroutine vec_ld_testi32s ! vec_lde !---------------------- -! CHECK-LABEL: @vec_lde_testi8s +! LLVM-LABEL: @vec_lde_testi8s subroutine vec_lde_testi8s(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2 @@ -129,7 +130,7 @@ subroutine vec_lde_testi8s(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[call]], ptr %2, align 16 end subroutine vec_lde_testi8s -! CHECK-LABEL: @vec_lde_testi16a +! LLVM-LABEL: @vec_lde_testi16a subroutine vec_lde_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(2, 4, 8) @@ -142,7 +143,7 @@ subroutine vec_lde_testi16a(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[call]], ptr %2, align 16 end subroutine vec_lde_testi16a -! CHECK-LABEL: @vec_lde_testi32a +! LLVM-LABEL: @vec_lde_testi32a subroutine vec_lde_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(4) @@ -155,7 +156,7 @@ subroutine vec_lde_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_lde_testi32a -! CHECK-LABEL: @vec_lde_testf32a +! LLVM-LABEL: @vec_lde_testf32a subroutine vec_lde_testf32a(arg1, arg2, res) integer(8) :: arg1 real(4) :: arg2(4) @@ -173,7 +174,7 @@ end subroutine vec_lde_testf32a ! vec_ldl !---------------------- -! CHECK-LABEL: @vec_ldl_testi8 +! LLVM-LABEL: @vec_ldl_testi8 subroutine vec_ldl_testi8(arg1, arg2, res) integer(1) :: arg1 vector(integer(1)) :: arg2, res @@ -186,7 +187,7 @@ subroutine vec_ldl_testi8(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testi8 -! CHECK-LABEL: @vec_ldl_testi16 +! 
LLVM-LABEL: @vec_ldl_testi16 subroutine vec_ldl_testi16(arg1, arg2, res) integer(2) :: arg1 vector(integer(2)) :: arg2, res @@ -199,7 +200,7 @@ subroutine vec_ldl_testi16(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testi16 -! CHECK-LABEL: @vec_ldl_testi32 +! LLVM-LABEL: @vec_ldl_testi32 subroutine vec_ldl_testi32(arg1, arg2, res) integer(4) :: arg1 vector(integer(4)) :: arg2, res @@ -211,7 +212,7 @@ subroutine vec_ldl_testi32(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testi32 -! CHECK-LABEL: @vec_ldl_testf32 +! LLVM-LABEL: @vec_ldl_testf32 subroutine vec_ldl_testf32(arg1, arg2, res) integer(8) :: arg1 vector(real(4)) :: arg2, res @@ -225,7 +226,7 @@ subroutine vec_ldl_testf32(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testf32 -! CHECK-LABEL: @vec_ldl_testu32 +! LLVM-LABEL: @vec_ldl_testu32 subroutine vec_ldl_testu32(arg1, arg2, res) integer(1) :: arg1 vector(unsigned(4)) :: arg2, res @@ -237,7 +238,7 @@ subroutine vec_ldl_testu32(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_ldl_testu32 -! CHECK-LABEL: @vec_ldl_testi32a +! LLVM-LABEL: @vec_ldl_testi32a subroutine vec_ldl_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(10) @@ -250,7 +251,7 @@ subroutine vec_ldl_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16 end subroutine vec_ldl_testi32a -! CHECK-LABEL: @vec_ldl_testf32av +! LLVM-LABEL: @vec_ldl_testf32av subroutine vec_ldl_testf32av(arg1, arg2, res) integer(8) :: arg1 vector(real(4)) :: arg2(2, 4, 8) @@ -264,7 +265,7 @@ subroutine vec_ldl_testf32av(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_ldl_testf32av -! CHECK-LABEL: @vec_ldl_testi32s +! 
LLVM-LABEL: @vec_ldl_testi32s subroutine vec_ldl_testi32s(arg1, arg2, res) integer(4) :: arg1 real(4) :: arg2 @@ -282,7 +283,7 @@ end subroutine vec_ldl_testi32s ! vec_lvsl !---------------------- -! CHECK-LABEL: @vec_lvsl_testi8s +! LLVM-LABEL: @vec_lvsl_testi8s subroutine vec_lvsl_testi8s(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2 @@ -300,7 +301,7 @@ subroutine vec_lvsl_testi8s(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_lvsl_testi8s -! CHECK-LABEL: @vec_lvsl_testi16a +! LLVM-LABEL: @vec_lvsl_testi16a subroutine vec_lvsl_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(4) @@ -318,7 +319,7 @@ subroutine vec_lvsl_testi16a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_lvsl_testi16a -! CHECK-LABEL: @vec_lvsl_testi32a +! LLVM-LABEL: @vec_lvsl_testi32a subroutine vec_lvsl_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 3, 4) @@ -336,7 +337,7 @@ subroutine vec_lvsl_testi32a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_lvsl_testi32a -! CHECK-LABEL: @vec_lvsl_testf32a +! LLVM-LABEL: @vec_lvsl_testf32a subroutine vec_lvsl_testf32a(arg1, arg2, res) integer(8) :: arg1 real(4) :: arg2(4) @@ -357,7 +358,7 @@ end subroutine vec_lvsl_testf32a ! vec_lvsr !---------------------- -! CHECK-LABEL: @vec_lvsr_testi8s +! LLVM-LABEL: @vec_lvsr_testi8s subroutine vec_lvsr_testi8s(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2 @@ -375,7 +376,7 @@ subroutine vec_lvsr_testi8s(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[addr]], ptr %2, align 16 end subroutine vec_lvsr_testi8s -! CHECK-LABEL: @vec_lvsr_testi16a +! LLVM-LABEL: @vec_lvsr_testi16a subroutine vec_lvsr_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(4) @@ -393,7 +394,7 @@ subroutine vec_lvsr_testi16a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[addr]], ptr %2, align 16 end subroutine vec_lvsr_testi16a -! 
CHECK-LABEL: @vec_lvsr_testi32a +! LLVM-LABEL: @vec_lvsr_testi32a subroutine vec_lvsr_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 3, 4) @@ -411,7 +412,7 @@ subroutine vec_lvsr_testi32a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[addr]], ptr %2, align 16 end subroutine vec_lvsr_testi32a -! CHECK-LABEL: @vec_lvsr_testf32a +! LLVM-LABEL: @vec_lvsr_testf32a subroutine vec_lvsr_testf32a(arg1, arg2, res) integer(8) :: arg1 real(4) :: arg2(4) @@ -432,7 +433,7 @@ end subroutine vec_lvsr_testf32a ! vec_lxv !---------------------- -! CHECK-LABEL: @vec_lxv_testi8a +! LLVM-LABEL: @vec_lxv_testi8a subroutine vec_lxv_testi8a(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2(4) @@ -445,7 +446,7 @@ subroutine vec_lxv_testi8a(arg1, arg2, res) ! LLVMIR_P9: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_lxv_testi8a -! CHECK-LABEL: @vec_lxv_testi16a +! LLVM-LABEL: @vec_lxv_testi16a subroutine vec_lxv_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(2, 4, 8) @@ -458,7 +459,7 @@ subroutine vec_lxv_testi16a(arg1, arg2, res) ! LLVMIR_P9: store <8 x i16> %[[ld]], ptr %2, align 16 end subroutine vec_lxv_testi16a -! CHECK-LABEL: @vec_lxv_testi32a +! LLVM-LABEL: @vec_lxv_testi32a subroutine vec_lxv_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 4, 8) @@ -471,7 +472,7 @@ subroutine vec_lxv_testi32a(arg1, arg2, res) ! LLVMIR_P9: store <4 x i32> %[[ld]], ptr %2, align 16 end subroutine vec_lxv_testi32a -! CHECK-LABEL: @vec_lxv_testf32a +! LLVM-LABEL: @vec_lxv_testf32a subroutine vec_lxv_testf32a(arg1, arg2, res) integer(2) :: arg1 real(4) :: arg2(4) @@ -484,7 +485,7 @@ subroutine vec_lxv_testf32a(arg1, arg2, res) ! LLVMIR_P9: store <4 x float> %[[ld]], ptr %2, align 16 end subroutine vec_lxv_testf32a -! CHECK-LABEL: @vec_lxv_testf64a +! LLVM-LABEL: @vec_lxv_testf64a subroutine vec_lxv_testf64a(arg1, arg2, res) integer(8) :: arg1 real(8) :: arg2(4) @@ -501,7 +502,7 @@ end subroutine vec_lxv_testf64a ! 
vec_xld2 !---------------------- -! CHECK-LABEL: @vec_xld2_testi8a +! LLVM-LABEL: @vec_xld2_testi8a subroutine vec_xld2_testi8a(arg1, arg2, res) integer(1) :: arg1 vector(integer(1)) :: arg2(4) @@ -515,7 +516,7 @@ subroutine vec_xld2_testi8a(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testi8a -! CHECK-LABEL: @vec_xld2_testi16 +! LLVM-LABEL: @vec_xld2_testi16 subroutine vec_xld2_testi16(arg1, arg2, res) integer :: arg1 vector(integer(2)) :: arg2 @@ -529,7 +530,7 @@ subroutine vec_xld2_testi16(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testi16 -! CHECK-LABEL: @vec_xld2_testi32a +! LLVM-LABEL: @vec_xld2_testi32a subroutine vec_xld2_testi32a(arg1, arg2, res) integer(4) :: arg1 vector(integer(4)) :: arg2(41) @@ -543,7 +544,7 @@ subroutine vec_xld2_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testi32a -! CHECK-LABEL: @vec_xld2_testi64a +! LLVM-LABEL: @vec_xld2_testi64a subroutine vec_xld2_testi64a(arg1, arg2, res) integer(8) :: arg1 vector(integer(8)) :: arg2(4) @@ -557,7 +558,7 @@ subroutine vec_xld2_testi64a(arg1, arg2, res) ! LLVMIR: store <2 x i64> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testi64a -! CHECK-LABEL: @vec_xld2_testf32a +! LLVM-LABEL: @vec_xld2_testf32a subroutine vec_xld2_testf32a(arg1, arg2, res) integer(2) :: arg1 vector(real(4)) :: arg2(4) @@ -571,7 +572,7 @@ subroutine vec_xld2_testf32a(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_xld2_testf32a -! CHECK-LABEL: @vec_xld2_testf64a +! LLVM-LABEL: @vec_xld2_testf64a subroutine vec_xld2_testf64a(arg1, arg2, res) integer(8) :: arg1 vector(real(8)) :: arg2(4) @@ -588,7 +589,7 @@ end subroutine vec_xld2_testf64a ! vec_xl !---------------------- -! CHECK-LABEL: @vec_xl_testi8a +! 
LLVM-LABEL: @vec_xl_testi8a subroutine vec_xl_testi8a(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2(4) @@ -601,7 +602,7 @@ subroutine vec_xl_testi8a(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_xl_testi8a -! CHECK-LABEL: @vec_xl_testi16a +! LLVM-LABEL: @vec_xl_testi16a subroutine vec_xl_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(2, 4, 8) @@ -614,7 +615,7 @@ subroutine vec_xl_testi16a(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[ld]], ptr %2, align 16 end subroutine vec_xl_testi16a -! CHECK-LABEL: @vec_xl_testi32a +! LLVM-LABEL: @vec_xl_testi32a subroutine vec_xl_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 4, 8) @@ -627,7 +628,7 @@ subroutine vec_xl_testi32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[ld]], ptr %2, align 16 end subroutine vec_xl_testi32a -! CHECK-LABEL: @vec_xl_testi64a +! LLVM-LABEL: @vec_xl_testi64a subroutine vec_xl_testi64a(arg1, arg2, res) integer(8) :: arg1 integer(8) :: arg2(2, 4, 8) @@ -641,7 +642,7 @@ subroutine vec_xl_testi64a(arg1, arg2, res) ! LLVMIR: store <2 x i64> %[[bc]], ptr %2, align 16 end subroutine vec_xl_testi64a -! CHECK-LABEL: @vec_xl_testf32a +! LLVM-LABEL: @vec_xl_testf32a subroutine vec_xl_testf32a(arg1, arg2, res) integer(2) :: arg1 real(4) :: arg2(4) @@ -655,7 +656,7 @@ subroutine vec_xl_testf32a(arg1, arg2, res) ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16 end subroutine vec_xl_testf32a -! CHECK-LABEL: @vec_xl_testf64a +! LLVM-LABEL: @vec_xl_testf64a subroutine vec_xl_testf64a(arg1, arg2, res) integer(8) :: arg1 real(8) :: arg2 @@ -672,7 +673,7 @@ end subroutine vec_xl_testf64a ! vec_xlds !---------------------- -! CHECK-LABEL: @vec_xlds_testi64a +! LLVM-LABEL: @vec_xlds_testi64a subroutine vec_xlds_testi64a(arg1, arg2, res) integer(8) :: arg1 vector(integer(8)) :: arg2(4) @@ -687,7 +688,7 @@ subroutine vec_xlds_testi64a(arg1, arg2, res) ! 
LLVMIR: store <2 x i64> %[[shfl]], ptr %2, align 16 end subroutine vec_xlds_testi64a -! CHECK-LABEL: @vec_xlds_testf64a +! LLVM-LABEL: @vec_xlds_testf64a subroutine vec_xlds_testf64a(arg1, arg2, res) integer(8) :: arg1 vector(real(8)) :: arg2(4) @@ -707,7 +708,7 @@ end subroutine vec_xlds_testf64a ! vec_xl_be !---------------------- -! CHECK-LABEL: @vec_xl_be_testi8a +! LLVM-LABEL: @vec_xl_be_testi8a subroutine vec_xl_be_testi8a(arg1, arg2, res) integer(1) :: arg1 integer(1) :: arg2(2, 4, 8) @@ -722,7 +723,7 @@ subroutine vec_xl_be_testi8a(arg1, arg2, res) ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testi8a -! CHECK-LABEL: @vec_xl_be_testi16a +! LLVM-LABEL: @vec_xl_be_testi16a subroutine vec_xl_be_testi16a(arg1, arg2, res) integer(2) :: arg1 integer(2) :: arg2(2, 4, 8) @@ -737,7 +738,7 @@ subroutine vec_xl_be_testi16a(arg1, arg2, res) ! LLVMIR-BE: store <8 x i16> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testi16a -! CHECK-LABEL: @vec_xl_be_testi32a +! LLVM-LABEL: @vec_xl_be_testi32a subroutine vec_xl_be_testi32a(arg1, arg2, res) integer(4) :: arg1 integer(4) :: arg2(2, 4, 8) @@ -752,7 +753,7 @@ subroutine vec_xl_be_testi32a(arg1, arg2, res) ! LLVMIR-BE: store <4 x i32> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testi32a -! CHECK-LABEL: @vec_xl_be_testi64a +! LLVM-LABEL: @vec_xl_be_testi64a subroutine vec_xl_be_testi64a(arg1, arg2, res) integer(8) :: arg1 integer(8) :: arg2(2, 4, 8) @@ -767,7 +768,7 @@ subroutine vec_xl_be_testi64a(arg1, arg2, res) ! LLVMIR-BE: store <2 x i64> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testi64a -! CHECK-LABEL: @vec_xl_be_testf32a +! LLVM-LABEL: @vec_xl_be_testf32a subroutine vec_xl_be_testf32a(arg1, arg2, res) integer(2) :: arg1 real(4) :: arg2(4) @@ -782,7 +783,7 @@ subroutine vec_xl_be_testf32a(arg1, arg2, res) ! LLVMIR-BE: store <4 x float> %[[ld]], ptr %2, align 16 end subroutine vec_xl_be_testf32a -! CHECK-LABEL: @vec_xl_be_testf64a +! 
LLVM-LABEL: @vec_xl_be_testf64a subroutine vec_xl_be_testf64a(arg1, arg2, res) integer(8) :: arg1 real(8) :: arg2(7) @@ -801,7 +802,7 @@ end subroutine vec_xl_be_testf64a ! vec_xlw4 !---------------------- -! CHECK-LABEL: @vec_xlw4_testi8a +! LLVM-LABEL: @vec_xlw4_testi8a subroutine vec_xlw4_testi8a(arg1, arg2, res) integer(1) :: arg1 vector(integer(1)) :: arg2(2, 4, 8) @@ -815,7 +816,7 @@ subroutine vec_xlw4_testi8a(arg1, arg2, res) ! LLVMIR: store <16 x i8> %[[res]], ptr %2, align 16 end subroutine vec_xlw4_testi8a -! CHECK-LABEL: @vec_xlw4_testi16a +! LLVM-LABEL: @vec_xlw4_testi16a subroutine vec_xlw4_testi16a(arg1, arg2, res) integer(2) :: arg1 vector(integer(2)) :: arg2(2, 4, 8) @@ -829,7 +830,7 @@ subroutine vec_xlw4_testi16a(arg1, arg2, res) ! LLVMIR: store <8 x i16> %[[res]], ptr %2, align 16 end subroutine vec_xlw4_testi16a -! CHECK-LABEL: @vec_xlw4_testu32a +! LLVM-LABEL: @vec_xlw4_testu32a subroutine vec_xlw4_testu32a(arg1, arg2, res) integer(4) :: arg1 vector(unsigned(4)) :: arg2(2, 4, 8) @@ -842,7 +843,7 @@ subroutine vec_xlw4_testu32a(arg1, arg2, res) ! LLVMIR: store <4 x i32> %[[ld]], ptr %2, align 16 end subroutine vec_xlw4_testu32a -! CHECK-LABEL: @vec_xlw4_testf32a +! LLVM-LABEL: @vec_xlw4_testf32a subroutine vec_xlw4_testf32a(arg1, arg2, res) integer(2) :: arg1 vector(real(4)) :: arg2(4) diff --git a/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90 b/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90 index bd83f28b4eeb52..6c4f202f89a456 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90 @@ -1,13 +1,13 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="CHECK" %s +! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR","LLVM" %s ! -! 
RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64-unknown-aix -o - | FileCheck --check-prefixes="BE-LLVMIR" %s +! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64-unknown-aix -o - | FileCheck --check-prefixes="BE-LLVMIR","LLVM" %s ! REQUIRES: target=powerpc{{.*}} !---------------------- ! vec_sld !---------------------- -! CHECK-LABEL: vec_sld_test_i1i1 +! LLVM-LABEL: vec_sld_test_i1i1 subroutine vec_sld_test_i1i1(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -23,7 +23,7 @@ subroutine vec_sld_test_i1i1(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i1i1 -! CHECK-LABEL: vec_sld_test_i1i2 +! LLVM-LABEL: vec_sld_test_i1i2 subroutine vec_sld_test_i1i2(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -39,7 +39,7 @@ subroutine vec_sld_test_i1i2(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i1i2 -! CHECK-LABEL: vec_sld_test_i1i4 +! LLVM-LABEL: vec_sld_test_i1i4 subroutine vec_sld_test_i1i4(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -55,7 +55,7 @@ subroutine vec_sld_test_i1i4(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i1i4 -! CHECK-LABEL: vec_sld_test_i1i8 +! LLVM-LABEL: vec_sld_test_i1i8 subroutine vec_sld_test_i1i8(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_8) @@ -71,7 +71,7 @@ subroutine vec_sld_test_i1i8(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i1i8 -! CHECK-LABEL: vec_sld_test_i2i1 +! LLVM-LABEL: vec_sld_test_i2i1 subroutine vec_sld_test_i2i1(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -93,7 +93,7 @@ subroutine vec_sld_test_i2i1(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i2i1 -! 
CHECK-LABEL: vec_sld_test_i2i2 +! LLVM-LABEL: vec_sld_test_i2i2 subroutine vec_sld_test_i2i2(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 8_2) @@ -115,7 +115,7 @@ subroutine vec_sld_test_i2i2(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i2i2 -! CHECK-LABEL: vec_sld_test_i2i4 +! LLVM-LABEL: vec_sld_test_i2i4 subroutine vec_sld_test_i2i4(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -137,7 +137,7 @@ subroutine vec_sld_test_i2i4(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i2i4 -! CHECK-LABEL: vec_sld_test_i2i8 +! LLVM-LABEL: vec_sld_test_i2i8 subroutine vec_sld_test_i2i8(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 11_8) @@ -159,7 +159,7 @@ subroutine vec_sld_test_i2i8(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i2i8 -! CHECK-LABEL: vec_sld_test_i4i1 +! LLVM-LABEL: vec_sld_test_i4i1 subroutine vec_sld_test_i4i1(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -181,7 +181,7 @@ subroutine vec_sld_test_i4i1(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i4i1 -! CHECK-LABEL: vec_sld_test_i4i2 +! LLVM-LABEL: vec_sld_test_i4i2 subroutine vec_sld_test_i4i2(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -203,7 +203,7 @@ subroutine vec_sld_test_i4i2(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i4i2 -! CHECK-LABEL: vec_sld_test_i4i4 +! LLVM-LABEL: vec_sld_test_i4i4 subroutine vec_sld_test_i4i4(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -225,7 +225,7 @@ subroutine vec_sld_test_i4i4(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i4i4 -! CHECK-LABEL: vec_sld_test_i4i8 +! 
LLVM-LABEL: vec_sld_test_i4i8 subroutine vec_sld_test_i4i8(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_8) @@ -247,7 +247,7 @@ subroutine vec_sld_test_i4i8(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_i4i8 -! CHECK-LABEL: vec_sld_test_u1i1 +! LLVM-LABEL: vec_sld_test_u1i1 subroutine vec_sld_test_u1i1(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -263,7 +263,7 @@ subroutine vec_sld_test_u1i1(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u1i1 -! CHECK-LABEL: vec_sld_test_u1i2 +! LLVM-LABEL: vec_sld_test_u1i2 subroutine vec_sld_test_u1i2(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -279,7 +279,7 @@ subroutine vec_sld_test_u1i2(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u1i2 -! CHECK-LABEL: vec_sld_test_u1i4 +! LLVM-LABEL: vec_sld_test_u1i4 subroutine vec_sld_test_u1i4(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -295,7 +295,7 @@ subroutine vec_sld_test_u1i4(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u1i4 -! CHECK-LABEL: vec_sld_test_u1i8 +! LLVM-LABEL: vec_sld_test_u1i8 subroutine vec_sld_test_u1i8(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -311,7 +311,7 @@ subroutine vec_sld_test_u1i8(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u1i8 -! CHECK-LABEL: vec_sld_test_u2i1 +! LLVM-LABEL: vec_sld_test_u2i1 subroutine vec_sld_test_u2i1(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -333,7 +333,7 @@ subroutine vec_sld_test_u2i1(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u2i1 -! CHECK-LABEL: vec_sld_test_u2i2 +! 
LLVM-LABEL: vec_sld_test_u2i2 subroutine vec_sld_test_u2i2(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -355,7 +355,7 @@ subroutine vec_sld_test_u2i2(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u2i2 -! CHECK-LABEL: vec_sld_test_u2i4 +! LLVM-LABEL: vec_sld_test_u2i4 subroutine vec_sld_test_u2i4(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -377,7 +377,7 @@ subroutine vec_sld_test_u2i4(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u2i4 -! CHECK-LABEL: vec_sld_test_u2i8 +! LLVM-LABEL: vec_sld_test_u2i8 subroutine vec_sld_test_u2i8(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_8) @@ -399,7 +399,7 @@ subroutine vec_sld_test_u2i8(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u2i8 -! CHECK-LABEL: vec_sld_test_u4i1 +! LLVM-LABEL: vec_sld_test_u4i1 subroutine vec_sld_test_u4i1(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -421,7 +421,7 @@ subroutine vec_sld_test_u4i1(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u4i1 -! CHECK-LABEL: vec_sld_test_u4i2 +! LLVM-LABEL: vec_sld_test_u4i2 subroutine vec_sld_test_u4i2(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -443,7 +443,7 @@ subroutine vec_sld_test_u4i2(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u4i2 -! CHECK-LABEL: vec_sld_test_u4i4 +! LLVM-LABEL: vec_sld_test_u4i4 subroutine vec_sld_test_u4i4(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -465,7 +465,7 @@ subroutine vec_sld_test_u4i4(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u4i4 -! CHECK-LABEL: vec_sld_test_u4i8 +! 
LLVM-LABEL: vec_sld_test_u4i8 subroutine vec_sld_test_u4i8(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_8) @@ -487,7 +487,7 @@ subroutine vec_sld_test_u4i8(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_u4i8 -! CHECK-LABEL: vec_sld_test_r4i1 +! LLVM-LABEL: vec_sld_test_r4i1 subroutine vec_sld_test_r4i1(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_1) @@ -509,7 +509,7 @@ subroutine vec_sld_test_r4i1(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_r4i1 -! CHECK-LABEL: vec_sld_test_r4i2 +! LLVM-LABEL: vec_sld_test_r4i2 subroutine vec_sld_test_r4i2(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_2) @@ -531,7 +531,7 @@ subroutine vec_sld_test_r4i2(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_r4i2 -! CHECK-LABEL: vec_sld_test_r4i4 +! LLVM-LABEL: vec_sld_test_r4i4 subroutine vec_sld_test_r4i4(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 3_4) @@ -553,7 +553,7 @@ subroutine vec_sld_test_r4i4(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sld_test_r4i4 -! CHECK-LABEL: vec_sld_test_r4i8 +! LLVM-LABEL: vec_sld_test_r4i8 subroutine vec_sld_test_r4i8(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sld(arg1, arg2, 1_8) @@ -578,7 +578,7 @@ end subroutine vec_sld_test_r4i8 !---------------------- ! vec_sldw !---------------------- -! CHECK-LABEL: vec_sldw_test_i1i1 +! LLVM-LABEL: vec_sldw_test_i1i1 subroutine vec_sldw_test_i1i1(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -594,7 +594,7 @@ subroutine vec_sldw_test_i1i1(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i1i1 -! CHECK-LABEL: vec_sldw_test_i1i2 +! 
LLVM-LABEL: vec_sldw_test_i1i2 subroutine vec_sldw_test_i1i2(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -610,7 +610,7 @@ subroutine vec_sldw_test_i1i2(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i1i2 -! CHECK-LABEL: vec_sldw_test_i1i4 +! LLVM-LABEL: vec_sldw_test_i1i4 subroutine vec_sldw_test_i1i4(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -626,7 +626,7 @@ subroutine vec_sldw_test_i1i4(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i1i4 -! CHECK-LABEL: vec_sldw_test_i1i8 +! LLVM-LABEL: vec_sldw_test_i1i8 subroutine vec_sldw_test_i1i8(arg1, arg2) vector(integer(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -642,7 +642,7 @@ subroutine vec_sldw_test_i1i8(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i1i8 -! CHECK-LABEL: vec_sldw_test_i2i1 +! LLVM-LABEL: vec_sldw_test_i2i1 subroutine vec_sldw_test_i2i1(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -664,7 +664,7 @@ subroutine vec_sldw_test_i2i1(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i2i1 -! CHECK-LABEL: vec_sldw_test_i2i2 +! LLVM-LABEL: vec_sldw_test_i2i2 subroutine vec_sldw_test_i2i2(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -686,7 +686,7 @@ subroutine vec_sldw_test_i2i2(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i2i2 -! CHECK-LABEL: vec_sldw_test_i2i4 +! LLVM-LABEL: vec_sldw_test_i2i4 subroutine vec_sldw_test_i2i4(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -708,7 +708,7 @@ subroutine vec_sldw_test_i2i4(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i2i4 -! CHECK-LABEL: vec_sldw_test_i2i8 +! 
LLVM-LABEL: vec_sldw_test_i2i8 subroutine vec_sldw_test_i2i8(arg1, arg2) vector(integer(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -730,7 +730,7 @@ subroutine vec_sldw_test_i2i8(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i2i8 -! CHECK-LABEL: vec_sldw_test_i4i1 +! LLVM-LABEL: vec_sldw_test_i4i1 subroutine vec_sldw_test_i4i1(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -752,7 +752,7 @@ subroutine vec_sldw_test_i4i1(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i4i1 -! CHECK-LABEL: vec_sldw_test_i4i2 +! LLVM-LABEL: vec_sldw_test_i4i2 subroutine vec_sldw_test_i4i2(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -774,7 +774,7 @@ subroutine vec_sldw_test_i4i2(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i4i2 -! CHECK-LABEL: vec_sldw_test_i4i4 +! LLVM-LABEL: vec_sldw_test_i4i4 subroutine vec_sldw_test_i4i4(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -796,7 +796,7 @@ subroutine vec_sldw_test_i4i4(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i4i4 -! CHECK-LABEL: vec_sldw_test_i4i8 +! LLVM-LABEL: vec_sldw_test_i4i8 subroutine vec_sldw_test_i4i8(arg1, arg2) vector(integer(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -818,7 +818,7 @@ subroutine vec_sldw_test_i4i8(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i4i8 -! CHECK-LABEL: vec_sldw_test_i8i1 +! LLVM-LABEL: vec_sldw_test_i8i1 subroutine vec_sldw_test_i8i1(arg1, arg2) vector(integer(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -840,7 +840,7 @@ subroutine vec_sldw_test_i8i1(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i8i1 -! CHECK-LABEL: vec_sldw_test_i8i2 +! 
LLVM-LABEL: vec_sldw_test_i8i2 subroutine vec_sldw_test_i8i2(arg1, arg2) vector(integer(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -862,7 +862,7 @@ subroutine vec_sldw_test_i8i2(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i8i2 -! CHECK-LABEL: vec_sldw_test_i8i4 +! LLVM-LABEL: vec_sldw_test_i8i4 subroutine vec_sldw_test_i8i4(arg1, arg2) vector(integer(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -884,7 +884,7 @@ subroutine vec_sldw_test_i8i4(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_i8i4 -! CHECK-LABEL: vec_sldw_test_i8i8 +! LLVM-LABEL: vec_sldw_test_i8i8 subroutine vec_sldw_test_i8i8(arg1, arg2) vector(integer(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -907,7 +907,7 @@ subroutine vec_sldw_test_i8i8(arg1, arg2) end subroutine vec_sldw_test_i8i8 -! CHECK-LABEL: vec_sldw_test_u1i1 +! LLVM-LABEL: vec_sldw_test_u1i1 subroutine vec_sldw_test_u1i1(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -923,7 +923,7 @@ subroutine vec_sldw_test_u1i1(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u1i1 -! CHECK-LABEL: vec_sldw_test_u1i2 +! LLVM-LABEL: vec_sldw_test_u1i2 subroutine vec_sldw_test_u1i2(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -939,7 +939,7 @@ subroutine vec_sldw_test_u1i2(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u1i2 -! CHECK-LABEL: vec_sldw_test_u1i4 +! LLVM-LABEL: vec_sldw_test_u1i4 subroutine vec_sldw_test_u1i4(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -955,7 +955,7 @@ subroutine vec_sldw_test_u1i4(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u1i4 -! CHECK-LABEL: vec_sldw_test_u1i8 +! 
LLVM-LABEL: vec_sldw_test_u1i8 subroutine vec_sldw_test_u1i8(arg1, arg2) vector(unsigned(1)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -971,7 +971,7 @@ subroutine vec_sldw_test_u1i8(arg1, arg2) ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u1i8 -! CHECK-LABEL: vec_sldw_test_u2i1 +! LLVM-LABEL: vec_sldw_test_u2i1 subroutine vec_sldw_test_u2i1(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -993,7 +993,7 @@ subroutine vec_sldw_test_u2i1(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u2i1 -! CHECK-LABEL: vec_sldw_test_u2i2 +! LLVM-LABEL: vec_sldw_test_u2i2 subroutine vec_sldw_test_u2i2(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1015,7 +1015,7 @@ subroutine vec_sldw_test_u2i2(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u2i2 -! CHECK-LABEL: vec_sldw_test_u2i4 +! LLVM-LABEL: vec_sldw_test_u2i4 subroutine vec_sldw_test_u2i4(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1037,7 +1037,7 @@ subroutine vec_sldw_test_u2i4(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u2i4 -! CHECK-LABEL: vec_sldw_test_u2i8 +! LLVM-LABEL: vec_sldw_test_u2i8 subroutine vec_sldw_test_u2i8(arg1, arg2) vector(unsigned(2)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -1059,7 +1059,7 @@ subroutine vec_sldw_test_u2i8(arg1, arg2) ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u2i8 -! CHECK-LABEL: vec_sldw_test_u4i1 +! LLVM-LABEL: vec_sldw_test_u4i1 subroutine vec_sldw_test_u4i1(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -1081,7 +1081,7 @@ subroutine vec_sldw_test_u4i1(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u4i1 -! 
CHECK-LABEL: vec_sldw_test_u4i2 +! LLVM-LABEL: vec_sldw_test_u4i2 subroutine vec_sldw_test_u4i2(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1103,7 +1103,7 @@ subroutine vec_sldw_test_u4i2(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u4i2 -! CHECK-LABEL: vec_sldw_test_u4i4 +! LLVM-LABEL: vec_sldw_test_u4i4 subroutine vec_sldw_test_u4i4(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1125,7 +1125,7 @@ subroutine vec_sldw_test_u4i4(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u4i4 -! CHECK-LABEL: vec_sldw_test_u4i8 +! LLVM-LABEL: vec_sldw_test_u4i8 subroutine vec_sldw_test_u4i8(arg1, arg2) vector(unsigned(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -1147,7 +1147,7 @@ subroutine vec_sldw_test_u4i8(arg1, arg2) ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u4i8 -! CHECK-LABEL: vec_sldw_test_u8i1 +! LLVM-LABEL: vec_sldw_test_u8i1 subroutine vec_sldw_test_u8i1(arg1, arg2) vector(unsigned(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -1169,7 +1169,7 @@ subroutine vec_sldw_test_u8i1(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u8i1 -! CHECK-LABEL: vec_sldw_test_u8i2 +! LLVM-LABEL: vec_sldw_test_u8i2 subroutine vec_sldw_test_u8i2(arg1, arg2) vector(unsigned(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1191,7 +1191,7 @@ subroutine vec_sldw_test_u8i2(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u8i2 -! CHECK-LABEL: vec_sldw_test_u8i4 +! LLVM-LABEL: vec_sldw_test_u8i4 subroutine vec_sldw_test_u8i4(arg1, arg2) vector(unsigned(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1213,7 +1213,7 @@ subroutine vec_sldw_test_u8i4(arg1, arg2) ! 
BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u8i4 -! CHECK-LABEL: vec_sldw_test_u8i8 +! LLVM-LABEL: vec_sldw_test_u8i8 subroutine vec_sldw_test_u8i8(arg1, arg2) vector(unsigned(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -1235,7 +1235,7 @@ subroutine vec_sldw_test_u8i8(arg1, arg2) ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_u8i8 -! CHECK-LABEL: vec_sldw_test_r4i1 +! LLVM-LABEL: vec_sldw_test_r4i1 subroutine vec_sldw_test_r4i1(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -1257,7 +1257,7 @@ subroutine vec_sldw_test_r4i1(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r4i1 -! CHECK-LABEL: vec_sldw_test_r4i2 +! LLVM-LABEL: vec_sldw_test_r4i2 subroutine vec_sldw_test_r4i2(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1279,7 +1279,7 @@ subroutine vec_sldw_test_r4i2(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r4i2 -! CHECK-LABEL: vec_sldw_test_r4i4 +! LLVM-LABEL: vec_sldw_test_r4i4 subroutine vec_sldw_test_r4i4(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1301,7 +1301,7 @@ subroutine vec_sldw_test_r4i4(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r4i4 -! CHECK-LABEL: vec_sldw_test_r4i8 +! LLVM-LABEL: vec_sldw_test_r4i8 subroutine vec_sldw_test_r4i8(arg1, arg2) vector(real(4)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) @@ -1323,7 +1323,7 @@ subroutine vec_sldw_test_r4i8(arg1, arg2) ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r4i8 -! CHECK-LABEL: vec_sldw_test_r8i1 +! 
LLVM-LABEL: vec_sldw_test_r8i1 subroutine vec_sldw_test_r8i1(arg1, arg2) vector(real(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_1) @@ -1345,7 +1345,7 @@ subroutine vec_sldw_test_r8i1(arg1, arg2) ! BE-LLVMIR: store <2 x double> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r8i1 -! CHECK-LABEL: vec_sldw_test_r8i2 +! LLVM-LABEL: vec_sldw_test_r8i2 subroutine vec_sldw_test_r8i2(arg1, arg2) vector(real(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_2) @@ -1367,7 +1367,7 @@ subroutine vec_sldw_test_r8i2(arg1, arg2) ! BE-LLVMIR: store <2 x double> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r8i2 -! CHECK-LABEL: vec_sldw_test_r8i4 +! LLVM-LABEL: vec_sldw_test_r8i4 subroutine vec_sldw_test_r8i4(arg1, arg2) vector(real(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_4) @@ -1389,7 +1389,7 @@ subroutine vec_sldw_test_r8i4(arg1, arg2) ! BE-LLVMIR: store <2 x double> %[[br]], ptr %{{.*}}, align 16 end subroutine vec_sldw_test_r8i4 -! CHECK-LABEL: vec_sldw_test_r8i8 +! LLVM-LABEL: vec_sldw_test_r8i8 subroutine vec_sldw_test_r8i8(arg1, arg2) vector(real(8)) :: arg1, arg2, r r = vec_sldw(arg1, arg2, 3_8) From 01fbc5658cdfa152519e2d0842ccf7d91aaeaeaf Mon Sep 17 00:00:00 2001 From: Artem Kroviakov <71938912+akroviakov@users.noreply.github.com> Date: Tue, 28 May 2024 14:54:37 +0200 Subject: [PATCH 04/89] [mlir][vector] Add support for linearizing Insert VectorOp in VectorLinearize (#92370) Building on top of [#88204](https://github.com/llvm/llvm-project/pull/88204), this PR adds support for converting `vector.insert` into an equivalent `vector.shuffle` operation that operates on linearized (1-D) vectors. 
--- .../Vector/Transforms/VectorLinearize.cpp | 97 ++++++++++++++++++- mlir/test/Dialect/Vector/linearize.mlir | 29 ++++++ 2 files changed, 125 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index 802a64b0805ee4..156bf742f6297a 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -44,6 +44,19 @@ static bool isLessThanTargetBitWidth(Operation *op, unsigned targetBitWidth) { return true; } +static bool isLessThanOrEqualTargetBitWidth(Type t, unsigned targetBitWidth) { + VectorType vecType = dyn_cast(t); + // Reject index since getElementTypeBitWidth will abort for Index types. + if (!vecType || vecType.getElementType().isIndex()) + return false; + // There are no dimension to fold if it is a 0-D vector. + if (vecType.getRank() == 0) + return false; + unsigned trailingVecDimBitWidth = + vecType.getShape().back() * vecType.getElementTypeBitWidth(); + return trailingVecDimBitWidth <= targetBitWidth; +} + namespace { struct LinearizeConstant final : OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -355,6 +368,88 @@ struct LinearizeVectorExtract final return success(); } +private: + unsigned targetVectorBitWidth; +}; + +/// This pattern converts the InsertOp to a ShuffleOp that works on a +/// linearized vector. +/// Following, +/// vector.insert %source %destination [ position ] +/// is converted to : +/// %source_1d = vector.shape_cast %source +/// %destination_1d = vector.shape_cast %destination +/// %out_1d = vector.shuffle %destination_1d, %source_1d [ shuffle_indices_1d +/// ] %out_nd = vector.shape_cast %out_1d +/// `shuffle_indices_1d` is computed using the position of the original insert. 
+struct LinearizeVectorInsert final + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LinearizeVectorInsert( + const TypeConverter &typeConverter, MLIRContext *context, + unsigned targetVectBitWidth = std::numeric_limits::max(), + PatternBenefit benefit = 1) + : OpConversionPattern(typeConverter, context, benefit), + targetVectorBitWidth(targetVectBitWidth) {} + LogicalResult + matchAndRewrite(vector::InsertOp insertOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Type dstTy = getTypeConverter()->convertType(insertOp.getDestVectorType()); + assert(!(insertOp.getDestVectorType().isScalable() || + cast(dstTy).isScalable()) && + "scalable vectors are not supported."); + + if (!isLessThanOrEqualTargetBitWidth(insertOp.getSourceType(), + targetVectorBitWidth)) + return rewriter.notifyMatchFailure( + insertOp, "Can't flatten since targetBitWidth < OpSize"); + + // dynamic position is not supported + if (insertOp.hasDynamicPosition()) + return rewriter.notifyMatchFailure(insertOp, + "dynamic position is not supported."); + auto srcTy = insertOp.getSourceType(); + auto srcAsVec = dyn_cast(srcTy); + uint64_t srcSize = 0; + if (srcAsVec) { + srcSize = srcAsVec.getNumElements(); + } else { + return rewriter.notifyMatchFailure(insertOp, + "scalars are not supported."); + } + + auto dstShape = insertOp.getDestVectorType().getShape(); + const auto dstSize = insertOp.getDestVectorType().getNumElements(); + auto dstSizeForOffsets = dstSize; + + // compute linearized offset + int64_t linearizedOffset = 0; + auto offsetsNd = insertOp.getStaticPosition(); + for (auto [dim, offset] : llvm::enumerate(offsetsNd)) { + dstSizeForOffsets /= dstShape[dim]; + linearizedOffset += offset * dstSizeForOffsets; + } + + llvm::SmallVector indices(dstSize); + auto origValsUntil = indices.begin(); + std::advance(origValsUntil, linearizedOffset); + std::iota(indices.begin(), origValsUntil, + 0); // original values that remain [0, 
offset) + auto newValsUntil = origValsUntil; + std::advance(newValsUntil, srcSize); + std::iota(origValsUntil, newValsUntil, + dstSize); // new values [offset, offset+srcNumElements) + std::iota(newValsUntil, indices.end(), + linearizedOffset + srcSize); // the rest of original values + // [offset+srcNumElements, end) + + rewriter.replaceOpWithNewOp( + insertOp, dstTy, adaptor.getDest(), adaptor.getSource(), + rewriter.getI64ArrayAttr(indices)); + + return success(); + } + private: unsigned targetVectorBitWidth; }; @@ -410,6 +505,6 @@ void mlir::vector::populateVectorLinearizeShuffleLikeOpsPatterns( : true; }); patterns.add( + LinearizeVectorInsert, LinearizeVectorExtractStridedSlice>( typeConverter, patterns.getContext(), targetBitWidth); } diff --git a/mlir/test/Dialect/Vector/linearize.mlir b/mlir/test/Dialect/Vector/linearize.mlir index b29ceab5783d7a..31a59b809a74ba 100644 --- a/mlir/test/Dialect/Vector/linearize.mlir +++ b/mlir/test/Dialect/Vector/linearize.mlir @@ -245,3 +245,32 @@ func.func @test_vector_extract(%arg0: vector<2x8x2xf32>) -> vector<8x2xf32> { %0 = vector.extract %arg0[1]: vector<8x2xf32> from vector<2x8x2xf32> return %0 : vector<8x2xf32> } + +// ----- +// ALL-LABEL: test_vector_insert +// ALL-SAME: (%[[DEST:.*]]: vector<2x8x4xf32>, %[[SRC:.*]]: vector<8x4xf32>) -> vector<2x8x4xf32> { +func.func @test_vector_insert(%arg0: vector<2x8x4xf32>, %arg1: vector<8x4xf32>) -> vector<2x8x4xf32> { + // DEFAULT: %[[ARG_SRC:.*]] = vector.shape_cast %[[SRC]] : vector<8x4xf32> to vector<32xf32> + // DEFAULT: %[[ARG_DEST:.*]] = vector.shape_cast %[[DEST]] : vector<2x8x4xf32> to vector<64xf32> + // DEFAULT: %[[SHUFFLE:.*]] = vector.shuffle %[[ARG_DEST]], %[[ARG_SRC]] + // DEFAULT-SAME: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, + // DEFAULT-SAME: 88, 89, 90, 91, 92, 93, 94, 95, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + // DEFAULT-SAME: 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 
60, 61, 62, 63] : vector<64xf32>, vector<32xf32> + // DEFAULT: %[[RES:.*]] = vector.shape_cast %[[SHUFFLE]] : vector<64xf32> to vector<2x8x4xf32> + // DEFAULT: return %[[RES]] : vector<2x8x4xf32> + + // BW-128: %[[ARG_SRC:.*]] = vector.shape_cast %[[SRC]] : vector<8x4xf32> to vector<32xf32> + // BW-128: %[[ARG_DEST:.*]] = vector.shape_cast %[[DEST]] : vector<2x8x4xf32> to vector<64xf32> + // BW-128: %[[SHUFFLE:.*]] = vector.shuffle %[[ARG_DEST]], %[[ARG_SRC]] + // BW-128-SAME: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, + // BW-128-SAME: 88, 89, 90, 91, 92, 93, 94, 95, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + // BW-128-SAME: 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] : vector<64xf32>, vector<32xf32> + // BW-128: %[[RES:.*]] = vector.shape_cast %[[SHUFFLE]] : vector<64xf32> to vector<2x8x4xf32> + // BW-128: return %[[RES]] : vector<2x8x4xf32> + + // BW-0: %[[RES:.*]] = vector.insert %[[SRC]], %[[DEST]] [0] : vector<8x4xf32> into vector<2x8x4xf32> + // BW-0: return %[[RES]] : vector<2x8x4xf32> + + %0 = vector.insert %arg1, %arg0[0]: vector<8x4xf32> into vector<2x8x4xf32> + return %0 : vector<2x8x4xf32> +} From bdd4e8b1c011a6cf30171d365b58327a4e321ba0 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 28 May 2024 15:03:40 +0200 Subject: [PATCH 05/89] [bazel] Port 17ecd23f6932c87fcc8b2b8675762d50f3d53056 --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index d1a2c6f11d98a7..a67f20533ae220 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2232,7 +2232,7 @@ llvm_target_lib_list = [lib for lib in [ ("-gen-callingconv", "lib/Target/X86/X86GenCallingConv.inc"), ("-gen-subtarget", "lib/Target/X86/X86GenSubtargetInfo.inc"), 
("-gen-x86-fold-tables -asmwriternum=1", "lib/Target/X86/X86GenFoldTables.inc"), - ("-gen-x86-compress-evex-tables", "lib/Target/X86/X86GenCompressEVEXTables.inc"), + ("-gen-x86-instr-mapping", "lib/Target/X86/X86GenInstrMapping.inc"), ("-gen-exegesis", "lib/Target/X86/X86GenExegesis.inc"), ("-gen-x86-mnemonic-tables -asmwriternum=1", "lib/Target/X86/X86GenMnemonicTables.inc"), ], From 5988c798de617cb35491c42de388b98b4c175421 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Tue, 28 May 2024 21:08:17 +0800 Subject: [PATCH 06/89] [X86][tablgen] Add assertions when emitting NF transform table --- llvm/utils/TableGen/X86InstrMappingEmitter.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp index a8970d8bcbacdc..d89a1f078328b5 100644 --- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp +++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp @@ -277,8 +277,22 @@ void X86InstrMappingEmitter::emitNFTransformTable( if (Pos == std::string::npos) continue; - if (auto *NewRec = Records.getDef(Name.erase(Pos, 3))) + if (auto *NewRec = Records.getDef(Name.erase(Pos, 3))) { +#ifndef NDEBUG + auto ClobberEFLAGS = [](const Record *R) { + return llvm::any_of( + R->getValueAsListOfDefs("Defs"), + [](const Record *Def) { return Def->getName() == "EFLAGS"; }); + }; + if (ClobberEFLAGS(Rec)) + report_fatal_error("EFLAGS should not be clobbered by " + + Rec->getName()); + if (!ClobberEFLAGS(NewRec)) + report_fatal_error("EFLAGS should be clobbered by " + + NewRec->getName()); +#endif Table.push_back(std::pair(&Target.getInstruction(NewRec), Inst)); + } } printTable(Table, "X86NFTransformTable", "GET_X86_NF_TRANSFORM_TABLE", OS); } From 2c7c9df6ba3e86d7286476e875e215b64059c590 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 28 May 2024 09:15:00 -0400 Subject: [PATCH 07/89] [gn] port 17ecd23f6932 (-gen-x86-instr-mapping) --- 
llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn | 6 +++--- llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn index 78a9d20812ef9b..8264f6d73e791e 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn @@ -12,9 +12,9 @@ tablegen("X86GenDAGISel") { td_file = "X86.td" } -tablegen("X86GenCompressEVEXTables") { +tablegen("X86GenInstrMapping") { visibility = [ ":LLVMX86CodeGen" ] - args = [ "-gen-x86-compress-evex-tables" ] + args = [ "-gen-x86-instr-mapping" ] td_file = "X86.td" } @@ -48,11 +48,11 @@ tablegen("X86GenRegisterBank") { static_library("LLVMX86CodeGen") { deps = [ ":X86GenCallingConv", - ":X86GenCompressEVEXTables", ":X86GenDAGISel", ":X86GenFastISel", ":X86GenFoldTables", ":X86GenGlobalISel", + ":X86GenInstrMapping", ":X86GenRegisterBank", "MCTargetDesc", "TargetInfo", diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn index f3ae5b5899ac6a..2e11d25767cd00 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn @@ -64,7 +64,7 @@ executable("llvm-tblgen") { "SearchableTableEmitter.cpp", "SubtargetEmitter.cpp", "WebAssemblyDisassemblerEmitter.cpp", - "X86CompressEVEXTablesEmitter.cpp", + "X86InstrMappingEmitter.cpp", "X86DisassemblerTables.cpp", "X86FoldTablesEmitter.cpp", "X86MnemonicTables.cpp", From de327865c0e255bc799458ce34bc913f598b4261 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Tue, 28 May 2024 14:16:09 +0100 Subject: [PATCH 08/89] [flang][HLFIR][NFC] Reduce HLFIR to FIR conversion boilerplate (#93539) The pass constructor can be generated automatically. 
This pass is module-level and then runs on all relevant intrinsic operations inside of the module, no matter what top level operation they are inside of. --- flang/include/flang/Optimizer/HLFIR/Passes.h | 4 ---- flang/include/flang/Optimizer/HLFIR/Passes.td | 1 - flang/include/flang/Tools/CLOptions.inc | 2 +- flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 4 ---- 4 files changed, 1 insertion(+), 10 deletions(-) diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.h b/flang/include/flang/Optimizer/HLFIR/Passes.h index edefe36de00c16..83388d0527e192 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.h +++ b/flang/include/flang/Optimizer/HLFIR/Passes.h @@ -20,10 +20,6 @@ namespace hlfir { #define GEN_PASS_DECL -#include "flang/Optimizer/HLFIR/Passes.h.inc" - -std::unique_ptr createConvertHLFIRtoFIRPass(); - #define GEN_PASS_REGISTRATION #include "flang/Optimizer/HLFIR/Passes.h.inc" } // namespace hlfir diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td index 1dd2e3dc81911f..ed49f5093c9652 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.td +++ b/flang/include/flang/Optimizer/HLFIR/Passes.td @@ -12,7 +12,6 @@ include "mlir/Pass/PassBase.td" def ConvertHLFIRtoFIR : Pass<"convert-hlfir-to-fir", "::mlir::ModuleOp"> { let summary = "Lower High-Level FIR to FIR"; - let constructor = "hlfir::createConvertHLFIRtoFIRPass()"; let dependentDialects = [ "mlir::func::FuncDialect", ]; diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index bb3c90ebc04d44..61ea7a7f9bbdd2 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -331,7 +331,7 @@ inline void createHLFIRToFIRPassPipeline( pm.addPass(hlfir::createLowerHLFIROrderedAssignments()); pm.addPass(hlfir::createLowerHLFIRIntrinsics()); pm.addPass(hlfir::createBufferizeHLFIR()); - pm.addPass(hlfir::createConvertHLFIRtoFIRPass()); + 
pm.addPass(hlfir::createConvertHLFIRtoFIR()); } /// Create a pass pipeline for handling certain OpenMP transformations needed diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index e56595d1c8e232..b8823bfa59f8f2 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -789,7 +789,3 @@ class ConvertHLFIRtoFIR }; } // namespace - -std::unique_ptr hlfir::createConvertHLFIRtoFIRPass() { - return std::make_unique(); -} From 44861c7ac563f9e994305e22f2dca1c4f37265e4 Mon Sep 17 00:00:00 2001 From: Sayan Saha Date: Tue, 28 May 2024 09:21:46 -0400 Subject: [PATCH 09/89] [mlir] [linalg] Check for dim shape to decide unit dim for each operand in dropUnitDims pass. (#93317) `mlir-opt --linalg-fold-unit-extent-dims` pass on the following IR ``` #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> module { func.func @main(%arg0: tensor<1x?x?x1xf32>, %arg1: index) -> tensor { %cst = arith.constant dense<1.000000e+00> : tensor<1x1x1x1xf32> %0 = tensor.empty(%arg1) : tensor %1 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %cst : tensor<1x?x?x1xf32>, tensor<1x1x1x1xf32>) outs(%0 : tensor) { ^bb0(%in: f32, %in_0: f32, %out: f32): %2 = arith.mulf %in, %in_0 : f32 %3 = arith.addf %out, %2 : f32 linalg.yield %3 : f32 } -> tensor return %1 : tensor } } ``` produces an incorrect tensor.expand_shape operation: ``` error: 'tensor.expand_shape' op expected dimension 0 of collapsed type to be dynamic since one or more of the corresponding dimensions in the expanded type is dynamic %1 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = 
["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %cst : tensor<1x?x?x1xf32>, tensor<1x1x1x1xf32>) outs(%0 : tensor) { ^ /mathworks/devel/sandbox/sayans/geckWorks/g3294570/repro.mlir:8:10: note: see current operation: %5 = "tensor.expand_shape"(%4) <{reassociation = [[0, 1, 2, 3]]}> : (tensor<61xf32>) -> tensor // -----// IR Dump After LinalgFoldUnitExtentDimsPass Failed (linalg-fold-unit-extent-dims) //----- // #map = affine_map<(d0) -> (0, d0)> #map1 = affine_map<(d0) -> ()> #map2 = affine_map<(d0) -> (d0)> "builtin.module"() ({ "func.func"() <{function_type = (tensor<1x?x?x1xf32>, index) -> tensor, sym_name = "main"}> ({ ^bb0(%arg0: tensor<1x?x?x1xf32>, %arg1: index): %0 = "arith.constant"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor %1 = "tensor.collapse_shape"(%arg0) <{reassociation = [[0, 1], [2, 3]]}> : (tensor<1x?x?x1xf32>) -> tensor %2 = "tensor.empty"() : () -> tensor<61xf32> %3 = "tensor.empty"() : () -> tensor<61xf32> %4 = "linalg.generic"(%1, %0, %2, %3) <{indexing_maps = [#map, #map1, #map2, #map2], iterator_types = [#linalg.iterator_type], operandSegmentSizes = array}> ({ ^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32): %6 = "arith.mulf"(%arg2, %arg3) <{fastmath = #arith.fastmath}> : (f32, f32) -> f32 %7 = "arith.addf"(%arg4, %6) <{fastmath = #arith.fastmath}> : (f32, f32) -> f32 "linalg.yield"(%7) : (f32) -> () }) : (tensor, tensor, tensor<61xf32>, tensor<61xf32>) -> tensor<61xf32> %5 = "tensor.expand_shape"(%4) <{reassociation = [[0, 1, 2, 3]]}> : (tensor<61xf32>) -> tensor "func.return"(%5) : (tensor) -> () }) : () -> () }) : () -> () ``` The reason of this is because the dimension `d0` is determined to be an unit-dim that can be dropped based on the dimensions of operand `arg0` to `linalg.generic`. Later on when iterating over operand `outs` the dimension `d0` is determined to be an unit-dim even though the shape corresponding to it is `Shape::kDynamic`. 
For the `linalg.generic` to be valid `d0` of `outs` does need to be `1` but that isn't properly processed in the current implementation and the dimension is dropped resulting in `outs` operand to be `tensor<61xf32>` in the example. The fix is to also check that the dimension shape is actually `1` before dropping the dimension. The IR after the fix is: ``` #map = affine_map<()[s0, s1] -> (s0 * s1)> #map1 = affine_map<(d0) -> (0, d0)> #map2 = affine_map<(d0) -> ()> module { func.func @main(%arg0: tensor<1x?x?x1xf32>, %arg1: index) -> tensor { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %cst = arith.constant dense<1.000000e+00> : tensor %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor %0 = tensor.empty(%arg1) : tensor %1 = affine.apply #map()[%arg1, %c1] %2 = tensor.empty(%1) : tensor %3 = linalg.generic {indexing_maps = [#map1, #map2, #map1, #map1], iterator_types = ["parallel"]} ins(%collapsed, %cst, %0 : tensor, tensor, tensor) outs(%2 : tensor) { ^bb0(%in: f32, %in_0: f32, %in_1: f32, %out: f32): %4 = arith.mulf %in, %in_0 : f32 %5 = arith.addf %in_1, %4 : f32 linalg.yield %5 : f32 } -> tensor %expanded = tensor.expand_shape %3 [[0, 1], [2, 3]] output_shape [%c0, 1, 61, 1] : tensor into tensor return %expanded : tensor } } ``` --- .../Linalg/Transforms/DropUnitDims.cpp | 3 +- .../Dialect/Linalg/drop-unit-extent-dims.mlir | 43 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp index 65efa18af18f65..c0829397f1f851 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -351,7 +351,8 @@ static UnitExtentReplacementInfo dropUnitExtentFromOperandMetadata( auto isUnitDim = [&](unsigned dim) { if (auto dimExpr = dyn_cast(exprs[dim])) { unsigned oldPosition = dimExpr.getPosition(); - return 
!oldDimsToNewDimsMap.count(oldPosition); + return !oldDimsToNewDimsMap.count(oldPosition) && + (operandShape[dim] == 1); } // Handle the other case where the shape is 1, and is accessed using a // constant 0. diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir index a9cbaaf7fdc485..8f9b12880adcf7 100644 --- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir +++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir @@ -1087,3 +1087,46 @@ func.func @drop_known_unit_constant_low_high(%arg0: tensor<1x383x128xf32>) -> te // CHECK: } : tensor<383x128xf32> to tensor<384x128xf32> // CHECK: tensor.expand_shape %[[PADDED]] // CHECK-SAME: {{\[}}[0, 1], [2]] output_shape [1, 384, 128] : tensor<384x128xf32> into tensor<1x384x128xf32> + +// ----- + +// CHECK: #[[$MAP0:.+]] = affine_map<()[s0, s1] -> (s0 * s1)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (0, d0)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> ()> + +// CHECK-LABEL: func @drop_unit_dim_corresponding_to_dynamic_dim +// CHECK-SAME: %[[ARG0:.*]]: tensor<1x?x?x1xf32>, +// CHECK-SAME: %[[ARG1:.*]]: index) -> tensor { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant dense<1.000000e+00> : tensor +// CHECK: %[[VAL_3:.*]] = tensor.collapse_shape %[[ARG0]] {{\[\[}}0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor +// CHECK: %[[VAL_4:.*]] = tensor.empty(%[[ARG1]]) : tensor +// CHECK: %[[VAL_5:.*]] = affine.apply #[[$MAP0]](){{\[}}%[[ARG1]], %[[VAL_1]]] +// CHECK: %[[VAL_6:.*]] = tensor.empty(%[[VAL_5]]) : tensor +// CHECK: %[[VAL_7:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel"]} ins(%[[VAL_3]], %[[VAL_2]], %[[VAL_4]] : tensor, tensor, tensor) outs(%[[VAL_6]] : tensor) { +// CHECK: ^bb0(%[[VAL_8:.*]]: f32, %[[VAL_9:.*]]: f32, %[[VAL_10:.*]]: f32, %[[VAL_11:.*]]: f32): +// CHECK: 
%[[VAL_12:.*]] = arith.mulf %[[VAL_8]], %[[VAL_9]] : f32 +// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_10]], %[[VAL_12]] : f32 +// CHECK: linalg.yield %[[VAL_13]] : f32 +// CHECK: } -> tensor +// CHECK: %[[VAL_14:.*]] = tensor.expand_shape %[[VAL_7]] {{\[\[}}0, 1], [2, 3]] output_shape {{\[}}%[[VAL_0]], 1, 61, 1] : tensor into tensor +// CHECK: return %[[VAL_14]] : tensor +// CHECK: } + +#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> +module { + func.func @drop_unit_dim_corresponding_to_dynamic_dim(%arg0: tensor<1x?x?x1xf32>, %arg1: index) -> tensor { + %cst = arith.constant dense<1.000000e+00> : tensor<1x1x1x1xf32> + %0 = tensor.empty(%arg1) : tensor + %1 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %cst : tensor<1x?x?x1xf32>, tensor<1x1x1x1xf32>) outs(%0 : tensor) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %2 = arith.mulf %in, %in_0 : f32 + %3 = arith.addf %out, %2 : f32 + linalg.yield %3 : f32 + } -> tensor + return %1 : tensor + } +} From 24a12a9c85b1ec08ff597f43e3414271d8439a97 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 28 May 2024 09:22:55 -0400 Subject: [PATCH 10/89] [clang] Diagnose problematic diagnostic messages (#93229) Clang has some unwritten rules about diagnostic wording regarding things like punctuation and capitalization. This patch documents those rules and adds some tablegen support for checking diagnostics follow the rules. Specifically: tablegen now checks that a diagnostic does not start with a capital letter or end with punctuation, except for the usual exceptions like proper nouns or ending with a question. 
Now that the code base is clean of such issues, the diagnostics are emitted as an error rather than a warning to ensure that failure to follow these rules is either addressed by an author, or a new exception is added to the checking logic. --- clang/docs/InternalsManual.rst | 38 ++++ clang/test/TableGen/deferred-diag.td | 10 +- clang/test/TableGen/text-substitution.td | 4 +- clang/test/TableGen/wording-errors.td | 55 +++++ .../TableGen/ClangDiagnosticsEmitter.cpp | 194 ++++++++++++++++++ 5 files changed, 294 insertions(+), 7 deletions(-) create mode 100644 clang/test/TableGen/wording-errors.td diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index b3e2b870ae5f9a..3d21e37784b363 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -123,6 +123,44 @@ severe that error recovery won't be able to recover sensibly from them (thus spewing a ton of bogus errors). One example of this class of error are failure to ``#include`` a file. +Diagnostic Wording +^^^^^^^^^^^^^^^^^^ +The wording used for a diagnostic is critical because it is the only way for a +user to know how to correct their code. Use the following suggestions when +wording a diagnostic. + +* Diagnostics in Clang do not start with a capital letter and do not end with + punctuation. + + * This does not apply to proper nouns like ``Clang`` or ``OpenMP``, to + acronyms like ``GCC`` or ``ARC``, or to language standards like ``C23`` + or ``C++17``. + * A trailing question mark is allowed. e.g., ``unknown identifier %0; did + you mean %1?``. + +* Appropriately capitalize proper nouns like ``Clang``, ``OpenCL``, ``GCC``, + ``Objective-C``, etc and language standard versions like ``C11`` or ``C++11``. +* The wording should be succinct. If necessary, use a semicolon to combine + sentence fragments instead of using complete sentences. 
e.g., prefer wording + like ``'%0' is deprecated; it will be removed in a future release of Clang`` + over wording like ``'%0' is deprecated. It will be removed in a future release + of Clang``. +* The wording should be actionable and avoid using standards terms or grammar + productions that a new user would not be familiar with. e.g., prefer wording + like ``missing semicolon`` over wording like ``syntax error`` (which is not + actionable) or ``expected unqualified-id`` (which uses standards terminology). +* The wording should clearly explain what is wrong with the code rather than + restating what the code does. e.g., prefer wording like ``type %0 requires a + value in the range %1 to %2`` over wording like ``%0 is invalid``. +* The wording should have enough contextual information to help the user + identify the issue in a complex expression. e.g., prefer wording like + ``both sides of the %0 binary operator are identical`` over wording like + ``identical operands to binary operator``. +* Use single quotes to denote syntactic constructs or command line arguments + named in a diagnostic message. e.g., prefer wording like ``'this' pointer + cannot be null in well-defined C++ code`` over wording like ``this pointer + cannot be null in well-defined C++ code``. + The Format String ^^^^^^^^^^^^^^^^^ diff --git a/clang/test/TableGen/deferred-diag.td b/clang/test/TableGen/deferred-diag.td index c1906d4a9e45ec..d7e8e694c7b3e4 100644 --- a/clang/test/TableGen/deferred-diag.td +++ b/clang/test/TableGen/deferred-diag.td @@ -4,24 +4,24 @@ include "DiagnosticBase.inc" // Test usage of Deferrable and NonDeferrable in diagnostics. 
-def test_default : Error<"This error is non-deferrable by default">; +def test_default : Error<"this error is non-deferrable by default">; // CHECK-DAG: DIAG(test_default, {{.*}}SFINAE_SubstitutionFailure, false, true, true, false, 0) -def test_deferrable : Error<"This error is deferrable">, Deferrable; +def test_deferrable : Error<"this error is deferrable">, Deferrable; // CHECK-DAG: DIAG(test_deferrable, {{.*}} SFINAE_SubstitutionFailure, false, true, true, true, 0) -def test_non_deferrable : Error<"This error is non-deferrable">, NonDeferrable; +def test_non_deferrable : Error<"this error is non-deferrable">, NonDeferrable; // CHECK-DAG: DIAG(test_non_deferrable, {{.*}} SFINAE_SubstitutionFailure, false, true, true, false, 0) let Deferrable = 1 in { -def test_let : Error<"This error is deferrable by let">; +def test_let : Error<"this error is deferrable by let">; // CHECK-DAG: DIAG(test_let, {{.*}} SFINAE_SubstitutionFailure, false, true, true, true, 0) // Make sure TextSubstitution is allowed in the let Deferrable block. def textsub : TextSubstitution<"%select{text1|text2}0">; -def test_let2 : Error<"This error is deferrable by let %sub{textsub}0">; +def test_let2 : Error<"this error is deferrable by let %sub{textsub}0">; // CHECK-DAG: DIAG(test_let2, {{.*}} SFINAE_SubstitutionFailure, false, true, true, true, 0) } diff --git a/clang/test/TableGen/text-substitution.td b/clang/test/TableGen/text-substitution.td index aafdbe48c43bec..b0d030aca65134 100644 --- a/clang/test/TableGen/text-substitution.td +++ b/clang/test/TableGen/text-substitution.td @@ -26,8 +26,8 @@ def sub_test_rewrite : TextSubstitution< // CHECK-SAME: Q! %q1. // CHECK-SAME: PLACEHOLDER! %0.OBJCCLASS! // CHECK-SAME: %objcclass5. OBJCINSTANCE! -// CHECK-SAME: %objcinstance4. DONE!", -def test_rewrite: Error<"%sub{sub_test_rewrite}5,4,3,2,1,0 DONE!">; +// CHECK-SAME: %objcinstance4. 
DONE", +def test_rewrite: Error<"%sub{sub_test_rewrite}5,4,3,2,1,0 DONE">; def test_sub_basic : Error<"%sub{yes_no}0">; // CHECK: test_sub_basic diff --git a/clang/test/TableGen/wording-errors.td b/clang/test/TableGen/wording-errors.td new file mode 100644 index 00000000000000..eb5eb2f547c782 --- /dev/null +++ b/clang/test/TableGen/wording-errors.td @@ -0,0 +1,55 @@ +// RUN: not clang-tblgen -gen-clang-diags-defs -I%S %s -o /dev/null 2>&1 | FileCheck %s +include "DiagnosticBase.inc" + +// Ensure we catch a capital letter at the start of a diagnostic. +def zero : Error< + "This is bad">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'This' is invalid + +// Test that we also correctly handle selections. +def one : Error< + "%select{|or}0 That">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'That' is invalid +def two : Error< + "%select{as does|}0 This">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'This' is invalid +def three : Error< + "%select{and||of course}0 Whatever">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'Whatever' is invalid + +// Test that we accept the following cases. +def four : Error< + "this is fine">; +def five : Error< + "%select{this|is|also}0 Fine">; +def six : Error< + "%select{this|is|also|}0 fine">; +def seven : Error< + "%select{ARC|C|C23|C++14|OpenMP}0 are also fine">; + +// Next, test that we catch punctuation at the end of the diagnostic. +def eight : Error< + "punctuation is bad.">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def nine : Error< + "it's really bad!">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '!' 
is invalid +def ten : Error< + "we also catch %select{punctuation.|in select}0">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def eleven : Error< + "and %select{|here.}0">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def twelve : Error< + "and %select{here.|}0">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def thirteen : Error< + "and even %select{|here.|}0">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid +def fourteen : Error< + "and %select{here}0.">; +// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid + +// Test that we accept the following cases. +def fifteen : Error< + "question marks are intentionally okay?">; diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index f564689fff7cf1..b290530444d2ab 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -1213,6 +1213,197 @@ static bool isRemark(const Record &Diag) { return ClsName == "CLASS_REMARK"; } +// Presumes the text has been split at the first whitespace or hyphen. +static bool isExemptAtStart(StringRef Text) { + // Fast path, the first character is lowercase or not alphanumeric. + if (Text.empty() || isLower(Text[0]) || !isAlnum(Text[0])) + return true; + + // If the text is all uppercase (or numbers, +, or _), then we assume it's an + // acronym and that's allowed. This covers cases like ISO, C23, C++14, and + // OBJECT_MODE. However, if there's only a single letter other than "C", we + // do not exempt it so that we catch a case like "A really bad idea" while + // still allowing a case like "C does not allow...". 
+ if (llvm::all_of(Text, [](char C) { + return isUpper(C) || isDigit(C) || C == '+' || C == '_'; + })) + return Text.size() > 1 || Text[0] == 'C'; + + // Otherwise, there are a few other exemptions. + return StringSwitch(Text) + .Case("AddressSanitizer", true) + .Case("CFString", true) + .Case("Clang", true) + .Case("Fuchsia", true) + .Case("GNUstep", true) + .Case("IBOutletCollection", true) + .Case("Microsoft", true) + .Case("Neon", true) + .StartsWith("NSInvocation", true) // NSInvocation, NSInvocation's + .Case("Objective", true) // Objective-C (hyphen is a word boundary) + .Case("OpenACC", true) + .Case("OpenCL", true) + .Case("OpenMP", true) + .Case("Pascal", true) + .Case("Swift", true) + .Case("Unicode", true) + .Case("Vulkan", true) + .Case("WebAssembly", true) + .Default(false); +} + +// Does not presume the text has been split at all. +static bool isExemptAtEnd(StringRef Text) { + // Rather than come up with a list of characters that are allowed, we go the + // other way and look only for characters that are not allowed. + switch (Text.back()) { + default: + return true; + case '?': + // Explicitly allowed to support "; did you mean?". + return true; + case '.': + case '!': + return false; + } +} + +static void verifyDiagnosticWording(const Record &Diag) { + StringRef FullDiagText = Diag.getValueAsString("Summary"); + + auto DiagnoseStart = [&](StringRef Text) { + // Verify that the text does not start with a capital letter, except for + // special cases that are exempt like ISO and C++. Find the first word + // by looking for a word breaking character. 
+ char Separators[] = {' ', '-', ',', '}'}; + auto Iter = std::find_first_of( + Text.begin(), Text.end(), std::begin(Separators), std::end(Separators)); + + StringRef First = Text.substr(0, Iter - Text.begin()); + if (!isExemptAtStart(First)) { + PrintError(&Diag, + "Diagnostics should not start with a capital letter; '" + + First + "' is invalid"); + } + }; + + auto DiagnoseEnd = [&](StringRef Text) { + // Verify that the text does not end with punctuation like '.' or '!'. + if (!isExemptAtEnd(Text)) { + PrintError(&Diag, "Diagnostics should not end with punctuation; '" + + Text.substr(Text.size() - 1, 1) + "' is invalid"); + } + }; + + // If the diagnostic starts with %select, look through it to see whether any + // of the options will cause a problem. + if (FullDiagText.starts_with("%select{")) { + // Do a balanced delimiter scan from the start of the text to find the + // closing '}', skipping intermediary {} pairs. + + size_t BraceCount = 1; + constexpr size_t PercentSelectBraceLen = sizeof("%select{") - 1; + auto Iter = FullDiagText.begin() + PercentSelectBraceLen; + for (auto End = FullDiagText.end(); Iter != End; ++Iter) { + char Ch = *Iter; + if (Ch == '{') + ++BraceCount; + else if (Ch == '}') + --BraceCount; + if (!BraceCount) + break; + } + // Defending against a malformed diagnostic string. + if (BraceCount != 0) + return; + + StringRef SelectText = + FullDiagText.substr(PercentSelectBraceLen, Iter - FullDiagText.begin() - + PercentSelectBraceLen); + SmallVector SelectPieces; + SelectText.split(SelectPieces, '|'); + + // Walk over all of the individual pieces of select text to see if any of + // them start with an invalid character. If any of the select pieces is + // empty, we need to look at the first word after the %select to see + // whether that is invalid or not. If all of the pieces are fine, then we + // don't need to check anything else about the start of the diagnostic. 
+ bool CheckSecondWord = false; + for (StringRef Piece : SelectPieces) { + if (Piece.empty()) + CheckSecondWord = true; + else + DiagnoseStart(Piece); + } + + if (CheckSecondWord) { + // There was an empty select piece, so we need to check the second + // word. This catches situations like '%select{|fine}0 Not okay'. Add + // two to account for the closing curly brace and the number after it. + StringRef AfterSelect = + FullDiagText.substr(Iter - FullDiagText.begin() + 2).ltrim(); + DiagnoseStart(AfterSelect); + } + } else { + // If the start of the diagnostic is not %select, we can check the first + // word and be done with it. + DiagnoseStart(FullDiagText); + } + + // If the last character in the diagnostic is a number preceded by a }, scan + // backwards to see if this is for a %select{...}0. If it is, we need to look + // at each piece to see whether it ends in punctuation or not. + bool StillNeedToDiagEnd = true; + if (isDigit(FullDiagText.back()) && *(FullDiagText.end() - 2) == '}') { + // Scan backwards to find the opening curly brace. + size_t BraceCount = 1; + auto Iter = FullDiagText.end() - sizeof("}0"); + for (auto End = FullDiagText.begin(); Iter != End; --Iter) { + char Ch = *Iter; + if (Ch == '}') + ++BraceCount; + else if (Ch == '{') + --BraceCount; + if (!BraceCount) + break; + } + // Defending against a malformed diagnostic string. + if (BraceCount != 0) + return; + + // Continue the backwards scan to find the word before the '{' to see if it + // is 'select'. + constexpr size_t SelectLen = sizeof("select") - 1; + bool IsSelect = + (FullDiagText.substr(Iter - SelectLen - FullDiagText.begin(), + SelectLen) == "select"); + if (IsSelect) { + // Gather the content between the {} for the select in question so we can + // split it into pieces. + StillNeedToDiagEnd = false; // No longer need to handle the end. 
+ StringRef SelectText = + FullDiagText.substr(Iter - FullDiagText.begin() + /*{*/ 1, + FullDiagText.end() - Iter - /*pos before }0*/ 3); + SmallVector SelectPieces; + SelectText.split(SelectPieces, '|'); + for (StringRef Piece : SelectPieces) { + // Not worrying about a situation like: "this is bar. %select{foo|}0". + if (!Piece.empty()) + DiagnoseEnd(Piece); + } + } + } + + // If we didn't already cover the diagnostic because of a %select, handle it + // now. + if (StillNeedToDiagEnd) + DiagnoseEnd(FullDiagText); + + // FIXME: This could also be improved by looking for instances of clang or + // gcc in the diagnostic and recommend Clang or GCC instead. However, this + // runs into odd situations like [[clang::warn_unused_result]], + // #pragma clang, or --unwindlib=libgcc. +} /// ClangDiagsDefsEmitter - The top-level class emits .def files containing /// declarations of Clang diagnostics. @@ -1273,6 +1464,9 @@ void clang::EmitClangDiagsDefs(RecordKeeper &Records, raw_ostream &OS, if (!Component.empty() && Component != R.getValueAsString("Component")) continue; + // Validate diagnostic wording for common issues. 
+ verifyDiagnosticWording(R); + OS << "DIAG(" << R.getName() << ", "; OS << R.getValueAsDef("Class")->getName(); OS << ", (unsigned)diag::Severity::" From 6e1a04247d6cc3295be8e3b14286f95983632e1c Mon Sep 17 00:00:00 2001 From: Tyker Date: Tue, 28 May 2024 15:21:56 +0200 Subject: [PATCH 11/89] Fix failure after d46e37348ec3f8054b10bcbbe7c11149d7f61031 --- llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir | 8 ++++---- llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir | 11 +++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir index d8f2b08adaf2fb..dc20a1577aa5bc 100644 --- a/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir +++ b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir @@ -3,16 +3,16 @@ # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ # RUN: -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=0,ppc-xtoi-peephole-count=8 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=0-7 \ # RUN: | FileCheck %s --check-prefix=ALL # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=2 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=3-4 \ # RUN: | FileCheck %s --check-prefix=ONE-FIRSTSTORE # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=5,ppc-xtoi-peephole-count=2 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=5-6 \ # RUN: | FileCheck %s --check-prefix=ONE-SECONDSTORE # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu 
-verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=4 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=3-6 \ # RUN: | FileCheck %s --check-prefix=TWO --- diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir index cf3ff291e26c6a..09f7ededa20c64 100644 --- a/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir +++ b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir @@ -3,16 +3,19 @@ # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ # RUN: -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=0,ppc-per-op-peephole-count=6 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=0-5 \ # RUN: | FileCheck %s --check-prefix=ALL # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=1 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=0-5 \ +# RUN: | FileCheck %s --check-prefix=ALL +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=3 \ # RUN: | FileCheck %s --check-prefix=ONE-FIRST-RLWINM # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=4,ppc-per-op-peephole-count=1 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=4 \ # RUN: | FileCheck %s --check-prefix=ONE-SECOND-RLWINM # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \ -# RUN: -run-pass ppc-mi-peepholes %s -o - 
-debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=2 \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=3-4 \ # RUN: | FileCheck %s --check-prefix=TWO --- From 8995ccc4460ed8a90dcc9bd023743a8f59458f50 Mon Sep 17 00:00:00 2001 From: Xu Zhang Date: Tue, 28 May 2024 21:29:31 +0800 Subject: [PATCH 12/89] [Clang] Add support for [[msvc::noinline]] attribute. (#91720) Fixes #90941. Add support for ``[[msvc::noinline]]`` attribute, which is actually an alias of ``[[clang::noinline]]``. --- clang/include/clang/Basic/Attr.td | 7 ++-- clang/lib/Sema/SemaStmtAttr.cpp | 2 +- clang/test/CodeGen/attr-noinline.cpp | 32 ++++++++++++++++++ clang/test/Sema/attr-noinline.cpp | 50 ++++++++++++++++++++++++++-- 4 files changed, 86 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index e59cccccdd3690..ef9df1e9d8b4aa 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2025,9 +2025,12 @@ def Convergent : InheritableAttr { def NoInline : DeclOrStmtAttr { let Spellings = [CustomKeyword<"__noinline__">, GCC<"noinline">, CXX11<"clang", "noinline">, C23<"clang", "noinline">, + CXX11<"msvc", "noinline">, C23<"msvc", "noinline">, Declspec<"noinline">]; - let Accessors = [Accessor<"isClangNoInline", [CXX11<"clang", "noinline">, - C23<"clang", "noinline">]>]; + let Accessors = [Accessor<"isStmtNoInline", [CXX11<"clang", "noinline">, + C23<"clang", "noinline">, + CXX11<"msvc", "noinline">, + C23<"msvc", "noinline">]>]; let Documentation = [NoInlineDocs]; let Subjects = SubjectList<[Function, Stmt], WarnDiag, "functions and statements">; diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 8735d96c840793..82373fe96a8243 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -285,7 +285,7 @@ bool Sema::CheckAlwaysInlineAttr(const Stmt *OrigSt, const Stmt *CurSt, static Attr 
*handleNoInlineAttr(Sema &S, Stmt *St, const ParsedAttr &A, SourceRange Range) { NoInlineAttr NIA(S.Context, A); - if (!NIA.isClangNoInline()) { + if (!NIA.isStmtNoInline()) { S.Diag(St->getBeginLoc(), diag::warn_function_attribute_ignored_in_stmt) << "[[clang::noinline]]"; return nullptr; diff --git a/clang/test/CodeGen/attr-noinline.cpp b/clang/test/CodeGen/attr-noinline.cpp index f0588cfecf4631..c1fb9941b5251d 100644 --- a/clang/test/CodeGen/attr-noinline.cpp +++ b/clang/test/CodeGen/attr-noinline.cpp @@ -9,6 +9,7 @@ static int baz(int x) { } [[clang::noinline]] bool noi() { } +[[msvc::noinline]] bool ms_noi() { return true; } void foo(int i) { [[clang::noinline]] bar(); @@ -39,6 +40,31 @@ void foo(int i) { // CHECK: call noundef zeroext i1 @_Z3barv() } +void ms_noi_check(int i) { + [[msvc::noinline]] bar(); +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR:[0-9]+]] + [[msvc::noinline]] i = baz(i); +// CHECK: call noundef i32 @_ZL3bazi({{.*}}) #[[NOINLINEATTR]] + [[msvc::noinline]] (i = 4, bar()); +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] + [[msvc::noinline]] (void)(bar()); +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] + [[msvc::noinline]] f(bar(), bar()); +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] +// CHECK: call void @_Z1fbb({{.*}}) #[[NOINLINEATTR]] + [[msvc::noinline]] [] { bar(); bar(); }(); // noinline only applies to the anonymous function call +// CHECK: call void @"_ZZ12ms_noi_checkiENK3$_0clEv"(ptr {{[^,]*}} %ref.tmp) #[[NOINLINEATTR]] + [[msvc::noinline]] for (bar(); bar(); bar()) {} +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] +// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]] + [[msvc::noinline]] ms_noi(); +// CHECK: call noundef zeroext i1 @_Z6ms_noiv() + ms_noi(); +// CHECK: call noundef zeroext i1 @_Z6ms_noiv() +} + 
struct S { friend bool operator==(const S &LHS, const S &RHS); }; @@ -50,6 +76,12 @@ void func(const S &s1, const S &s2) { bool b; [[clang::noinline]] b = s1 == s2; // CHECK: call noundef zeroext i1 @_ZeqRK1SS1_({{.*}}) #[[NOINLINEATTR]] + + [[msvc::noinline]]g(s1 == s2); +// CHECK: call noundef zeroext i1 @_ZeqRK1SS1_({{.*}}) #[[NOINLINEATTR]] +// CHECK: call void @_Z1gb({{.*}}) #[[NOINLINEATTR]] + [[msvc::noinline]] b = s1 == s2; +// CHECK: call noundef zeroext i1 @_ZeqRK1SS1_({{.*}}) #[[NOINLINEATTR]] } // CHECK: attributes #[[NOINLINEATTR]] = { noinline } diff --git a/clang/test/Sema/attr-noinline.cpp b/clang/test/Sema/attr-noinline.cpp index bd6505b9fe98ef..6da0e873af1b6a 100644 --- a/clang/test/Sema/attr-noinline.cpp +++ b/clang/test/Sema/attr-noinline.cpp @@ -2,9 +2,9 @@ int bar(); -// expected-note@+1{{conflicting attribute is here}} +// expected-note@+1 2 {{conflicting attribute is here}} [[gnu::always_inline]] void always_inline_fn(void) { } -// expected-note@+1{{conflicting attribute is here}} +// expected-note@+1 2 {{conflicting attribute is here}} [[gnu::flatten]] void flatten_fn(void) { } [[gnu::noinline]] void noinline_fn(void) { } @@ -25,7 +25,21 @@ void foo() { __attribute__((noinline)) bar(); // expected-warning {{attribute is ignored on this statement as it only applies to functions; use '[[clang::noinline]]' on statements}} } +void ms_noi_check() { + [[msvc::noinline]] bar(); + [[msvc::noinline(0)]] bar(); // expected-error {{'noinline' attribute takes no arguments}} + int x; + [[msvc::noinline]] x = 0; // expected-warning {{'noinline' attribute is ignored because there exists no call expression inside the statement}} + [[msvc::noinline]] { asm("nop"); } // expected-warning {{'noinline' attribute is ignored because there exists no call expression inside the statement}} + [[msvc::noinline]] label: x = 1; // expected-warning {{'noinline' attribute only applies to functions and statements}} + + [[msvc::noinline]] always_inline_fn(); // 
expected-warning {{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + [[msvc::noinline]] flatten_fn(); // expected-warning {{statement attribute 'noinline' has higher precedence than function attribute 'flatten'}} + [[msvc::noinline]] noinline_fn(); +} + [[clang::noinline]] static int i = bar(); // expected-warning {{'noinline' attribute only applies to functions and statements}} +[[msvc::noinline]] static int j = bar(); // expected-warning {{'noinline' attribute only applies to functions and statements}} // This used to crash the compiler. template @@ -69,7 +83,39 @@ int variadic_baz(int x) { [[clang::noinline]] return non_dependent(x) + (dependent(x) + ...); } +template [[clang::always_inline]] +int qux(int x) { // #QUX + // expected-warning@+2{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#NO_DEP{{conflicting attribute is here}} + [[msvc::noinline]] non_dependent(x); + if constexpr (D>0) { + // expected-warning@+6{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#NO_DEP{{conflicting attribute is here}} + // expected-warning@+4 3{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#QUX 3{{conflicting attribute is here}} + // expected-note@#QUX_INST 3{{in instantiation}} + // expected-note@+1 3{{in instantiation}} + [[msvc::noinline]] return non_dependent(x), qux(x + 1); + } + return x; +} + +// We can't suppress if there is a variadic involved. +template +int variadic_qux(int x) { + // Diagnoses NO_DEP 2x, once during phase 1, the second during instantiation. + // Dianoses DEP 3x, once per variadic expansion. 
+ // expected-warning@+5 2{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#NO_DEP 2{{conflicting attribute is here}} + // expected-warning@+3 3{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}} + // expected-note@#DEP 3{{conflicting attribute is here}} + // expected-note@#QUX_VARIADIC_INST{{in instantiation}} + [[msvc::noinline]] return non_dependent(x) + (dependent(x) + ...); +} + void use() { baz<3>(0); // #BAZ_INST variadic_baz<0, 1, 2>(0); // #VARIADIC_INST + qux<3>(0); // #QUX_INST + variadic_qux<0, 1, 2>(0); // #QUX_VARIADIC_INST } From 2ace7bdcfe640c69bd4dcf508d39485e0ef7ea06 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Tue, 28 May 2024 15:38:02 +0200 Subject: [PATCH 13/89] [Clang] allow `` `@$ `` in raw string delimiters in C++26 (#93216) And as an extension in older language modes. Per https://eel.is/c++draft/lex.string#nt:d-char Fixes #93130 --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/CharInfo.h | 15 +++++++------- .../include/clang/Basic/DiagnosticLexKinds.td | 8 ++++++++ clang/lib/Basic/CharInfo.cpp | 20 +++++++++---------- clang/lib/Lex/Lexer.cpp | 11 +++++++++- clang/test/Lexer/cxx2c-raw-strings.cpp | 12 +++++++++++ 6 files changed, 49 insertions(+), 18 deletions(-) create mode 100644 clang/test/Lexer/cxx2c-raw-strings.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 182f8b5824258e..6b746cda53c71b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -802,6 +802,7 @@ Bug Fixes to C++ Support - Fixed a regression introduced in Clang 18 causing a static function overloading a non-static function with the same parameters not to be diagnosed. (Fixes #GH93456). - Clang now diagnoses unexpanded parameter packs in attributes. (Fixes #GH93269). +- Clang now allows ``@$``` in raw string literals. Fixes (#GH93130). 
Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h index d8079553118287..4d90528f7992e3 100644 --- a/clang/include/clang/Basic/CharInfo.h +++ b/clang/include/clang/Basic/CharInfo.h @@ -28,8 +28,7 @@ namespace charinfo { CHAR_LOWER = 0x0040, // a-z CHAR_UNDER = 0x0080, // _ CHAR_PERIOD = 0x0100, // . - CHAR_RAWDEL = 0x0200, // {}[]#<>%:;?*+-/^&|~!=,"' - CHAR_PUNCT = 0x0400 // `$@() + CHAR_PUNCT = 0x0200, // {}[]#<>%:;?*+-/^&|~!=,"'`$@() }; enum { @@ -152,7 +151,8 @@ LLVM_READONLY inline bool isHexDigit(unsigned char c) { /// Note that '_' is both a punctuation character and an identifier character! LLVM_READONLY inline bool isPunctuation(unsigned char c) { using namespace charinfo; - return (InfoTable[c] & (CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL|CHAR_PUNCT)) != 0; + return (InfoTable[c] & + (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT | CHAR_PUNCT)) != 0; } /// Return true if this character is an ASCII printable character; that is, a @@ -160,8 +160,8 @@ LLVM_READONLY inline bool isPunctuation(unsigned char c) { /// terminal. LLVM_READONLY inline bool isPrintable(unsigned char c) { using namespace charinfo; - return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD|CHAR_PUNCT| - CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL|CHAR_SPACE)) != 0; + return (InfoTable[c] & (CHAR_UPPER | CHAR_LOWER | CHAR_PERIOD | CHAR_PUNCT | + CHAR_DIGIT | CHAR_UNDER | CHAR_SPACE)) != 0; } /// Return true if this is the body character of a C preprocessing number, @@ -175,8 +175,9 @@ LLVM_READONLY inline bool isPreprocessingNumberBody(unsigned char c) { /// Return true if this is the body character of a C++ raw string delimiter. 
LLVM_READONLY inline bool isRawStringDelimBody(unsigned char c) { using namespace charinfo; - return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD| - CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL)) != 0; + return (InfoTable[c] & (CHAR_UPPER | CHAR_LOWER | CHAR_PERIOD | CHAR_DIGIT | + CHAR_UNDER | CHAR_PUNCT)) != 0 && + c != '(' && c != ')'; } enum class EscapeChar { diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 5a4551a96ca4e7..25fbfe83fa2bcf 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -111,6 +111,14 @@ def warn_cxx98_compat_raw_string_literal : Warning< "raw string literals are incompatible with C++98">, InGroup, DefaultIgnore; +def warn_cxx26_compat_raw_string_literal_character_set : Warning< + " '%0' in a raw string literal delimiter is incompatible " + "with standards before C++2c">, + InGroup, DefaultIgnore; +def ext_cxx26_raw_string_literal_character_set : Extension< + " '%0' in a raw string literal delimiter is a C++2c extension">, + InGroup, DefaultIgnore; + def warn_multichar_character_literal : Warning< "multi-character character constant">, InGroup; def warn_four_char_character_literal : Warning< diff --git a/clang/lib/Basic/CharInfo.cpp b/clang/lib/Basic/CharInfo.cpp index d02054c9718f5f..26d693b8e9b943 100644 --- a/clang/lib/Basic/CharInfo.cpp +++ b/clang/lib/Basic/CharInfo.cpp @@ -31,20 +31,20 @@ const uint16_t clang::charinfo::InfoTable[256] = { 0 , 0 , 0 , 0 , //32 SP 33 ! 34 " 35 # //36 $ 37 % 38 & 39 ' - CHAR_SPACE , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_SPACE , CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , //40 ( 41 ) 42 * 43 + //44 , 45 - 46 . 
47 / - CHAR_PUNCT , CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PERIOD , CHAR_PUNCT , //48 0 49 1 50 2 51 3 //52 4 53 5 54 6 55 7 CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , //56 8 57 9 58 : 59 ; //60 < 61 = 62 > 63 ? - CHAR_DIGIT , CHAR_DIGIT , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_DIGIT , CHAR_DIGIT , CHAR_PUNCT , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , //64 @ 65 A 66 B 67 C //68 D 69 E 70 F 71 G CHAR_PUNCT , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , @@ -59,8 +59,8 @@ const uint16_t clang::charinfo::InfoTable[256] = { CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , //88 X 89 Y 90 Z 91 [ //92 \ 93 ] 94 ^ 95 _ - CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_RAWDEL , - CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , CHAR_UNDER , //96 ` 97 a 98 b 99 c //100 d 101 e 102 f 103 g CHAR_PUNCT , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , @@ -75,6 +75,6 @@ const uint16_t clang::charinfo::InfoTable[256] = { CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , //120 x 121 y 122 z 123 { //124 | 125 } 126 ~ 127 DEL - CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_PUNCT , + CHAR_PUNCT , CHAR_PUNCT , CHAR_PUNCT , 0 }; diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index c98645993abe07..c7543a48c0b50e 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -2261,8 +2261,17 @@ bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, unsigned PrefixLen = 0; - while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) + while (PrefixLen != 16 && 
isRawStringDelimBody(CurPtr[PrefixLen])) { ++PrefixLen; + if (!isLexingRawMode() && + llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) { + const char *Pos = &CurPtr[PrefixLen]; + Diag(Pos, LangOpts.CPlusPlus26 + ? diag::warn_cxx26_compat_raw_string_literal_character_set + : diag::ext_cxx26_raw_string_literal_character_set) + << StringRef(Pos, 1); + } + } // If the last character was not a '(', then we didn't lex a valid delimiter. if (CurPtr[PrefixLen] != '(') { diff --git a/clang/test/Lexer/cxx2c-raw-strings.cpp b/clang/test/Lexer/cxx2c-raw-strings.cpp new file mode 100644 index 00000000000000..569a4b8447e57d --- /dev/null +++ b/clang/test/Lexer/cxx2c-raw-strings.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -Wc++26-extensions %s +// RUN: %clang_cc1 -std=c++2c -fsyntax-only -verify=cxx26 -Wpre-c++26-compat %s + +int main() { + (void) R"abc`@$(foobar)abc`@$"; + //expected-warning@-1 {{'`' in a raw string literal delimiter is a C++2c extension}} + //expected-warning@-2 {{'@' in a raw string literal delimiter is a C++2c extension}} + //expected-warning@-3 {{'$' in a raw string literal delimiter is a C++2c extension}} + //cxx26-warning@-4 {{'`' in a raw string literal delimiter is incompatible with standards before C++2c}} + //cxx26-warning@-5 {{'@' in a raw string literal delimiter is incompatible with standards before C++2c}} + //cxx26-warning@-6 {{'$' in a raw string literal delimiter is incompatible with standards before C++2c}} +} From 57790db07c5a70b557d9e0cc88d8cda417b2f30d Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 28 May 2024 13:45:52 +0000 Subject: [PATCH 14/89] [gn build] Port 23e1ed65c2c3 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index b642b2c82e6d8d..6bd56dd4117b03 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ 
b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -384,6 +384,9 @@ if (current_toolchain == default_toolchain) { "__concepts/totally_ordered.h", "__condition_variable/condition_variable.h", "__config", + "__configuration/abi.h", + "__configuration/compiler.h", + "__configuration/platform.h", "__coroutine/coroutine_handle.h", "__coroutine/coroutine_traits.h", "__coroutine/noop_coroutine_handle.h", From 46a30dfdfd765021a76c927f70f95024d30786f2 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 23 May 2024 14:09:10 +0000 Subject: [PATCH 15/89] Reland "[AArch64] NFC: Add RUN lines for streaming-compatible code." (#91599) This reverts commit aa9d467abaeb440dc70b64c0f35b8d5e731f3a19. --- ...streaming-mode-fixed-length-and-combine.ll | 83 + ...treaming-mode-fixed-length-bit-counting.ll | 457 +++ ...sve-streaming-mode-fixed-length-bitcast.ll | 97 + ...e-streaming-mode-fixed-length-bitselect.ll | 12 + ...treaming-mode-fixed-length-build-vector.ll | 88 + .../sve-streaming-mode-fixed-length-concat.ll | 228 ++ ...e-streaming-mode-fixed-length-ext-loads.ll | 138 + ...ing-mode-fixed-length-extract-subvector.ll | 136 + ...ng-mode-fixed-length-extract-vector-elt.ll | 53 + ...e-streaming-mode-fixed-length-fcopysign.ll | 177 ++ ...ve-streaming-mode-fixed-length-fp-arith.ll | 989 +++++++ ...streaming-mode-fixed-length-fp-compares.ll | 2486 +++++++++++++++++ ...-streaming-mode-fixed-length-fp-convert.ll | 12 + ...aming-mode-fixed-length-fp-extend-trunc.ll | 278 ++ .../sve-streaming-mode-fixed-length-fp-fma.ll | 116 + ...e-streaming-mode-fixed-length-fp-minmax.ll | 965 +++++++ ...eaming-mode-fixed-length-fp-reduce-fa64.ll | 25 + ...e-streaming-mode-fixed-length-fp-reduce.ll | 1058 +++++++ ...streaming-mode-fixed-length-fp-rounding.ll | 547 ++++ ...e-streaming-mode-fixed-length-fp-select.ll | 99 + ...e-streaming-mode-fixed-length-fp-to-int.ll | 925 ++++++ ...-streaming-mode-fixed-length-fp-vselect.ll | 199 ++ ...ing-mode-fixed-length-insert-vector-elt.ll | 172 ++ 
...e-streaming-mode-fixed-length-int-arith.ll | 371 +++ ...treaming-mode-fixed-length-int-compares.ll | 154 + ...sve-streaming-mode-fixed-length-int-div.ll | 1145 ++++++++ ...streaming-mode-fixed-length-int-extends.ll | 763 +++++ ...eaming-mode-fixed-length-int-immediates.ll | 546 ++++ ...sve-streaming-mode-fixed-length-int-log.ll | 229 ++ ...-streaming-mode-fixed-length-int-minmax.ll | 325 +++ ...ing-mode-fixed-length-int-mla-neon-fa64.ll | 7 + ...ve-streaming-mode-fixed-length-int-mulh.ll | 291 ++ ...-streaming-mode-fixed-length-int-reduce.ll | 415 +++ ...sve-streaming-mode-fixed-length-int-rem.ll | 1631 +++++++++++ ...-streaming-mode-fixed-length-int-select.ll | 137 + ...-streaming-mode-fixed-length-int-shifts.ll | 313 +++ ...e-streaming-mode-fixed-length-int-to-fp.ll | 822 ++++++ ...streaming-mode-fixed-length-int-vselect.ll | 123 + ...reaming-mode-fixed-length-limit-duplane.ll | 27 + .../sve-streaming-mode-fixed-length-loads.ll | 127 + ...-streaming-mode-fixed-length-log-reduce.ll | 436 +++ ...streaming-mode-fixed-length-masked-load.ll | 954 +++++++ ...treaming-mode-fixed-length-masked-store.ll | 774 +++++ ...eaming-mode-fixed-length-optimize-ptrue.ll | 216 ++ ...streaming-mode-fixed-length-permute-rev.ll | 127 + ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 320 +++ .../sve-streaming-mode-fixed-length-ptest.ll | 72 + .../sve-streaming-mode-fixed-length-rev.ll | 159 ++ ...e-streaming-mode-fixed-length-sdiv-pow2.ll | 132 + ...treaming-mode-fixed-length-splat-vector.ll | 182 ++ .../sve-streaming-mode-fixed-length-stores.ll | 136 + ...e-streaming-mode-fixed-length-subvector.ll | 133 + ...treaming-mode-fixed-length-trunc-stores.ll | 38 + .../sve-streaming-mode-fixed-length-trunc.ll | 389 +++ ...eaming-mode-fixed-length-vector-shuffle.ll | 151 + .../sve-streaming-mode-test-register-mov.ll | 21 + 56 files changed, 21006 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll index e843537c10a33a..ed3222529a3bb9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -14,6 +15,12 @@ define <4 x i8> @vls_sve_and_4xi8(<4 x i8> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_4xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff000000ff0000 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %c = and <4 x i8> %b, ret <4 x i8> %c } @@ -27,6 +34,12 @@ define <8 x i8> @vls_sve_and_8xi8(<8 x i8> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_8xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff00 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %c = and <8 x i8> %b, ret <8 x i8> %c } @@ -40,6 +53,12 @@ define <16 x i8> @vls_sve_and_16xi8(<16 x i8> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_16xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.2d, #0xff00ff00ff00ff00 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %c = and <16 x i8> %b, ret <16 x i8> %c } @@ -56,6 +75,13 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind { ; 
CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_32xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v2.2d, #0xff00ff00ff00ff00 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %b = and <32 x i8> %ap, ret <32 x i8> %b @@ -73,6 +99,13 @@ define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_2xi16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov v0.s[0], wzr +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %c = and <2 x i16> %b, ret <2 x i16> %c } @@ -86,6 +119,12 @@ define <4 x i16> @vls_sve_and_4xi16(<4 x i16> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_4xi16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xffff0000ffff0000 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %c = and <4 x i16> %b, ret <4 x i16> %c } @@ -99,6 +138,12 @@ define <8 x i16> @vls_sve_and_8xi16(<8 x i16> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_8xi16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.2d, #0xffff0000ffff0000 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %c = and <8 x i16> %b, ret <8 x i16> %c } @@ -115,6 +160,13 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_16xi16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: movi v2.2d, #0xffff0000ffff0000 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %c = and <16 x i16> %b, ret <16 x i16> %c } @@ -128,6 +180,13 @@ define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_2xi32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov v0.s[0], wzr +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %c = and <2 x i32> %b, ret <2 x i32> %c } @@ -141,6 +200,12 @@ define <4 x i32> @vls_sve_and_4xi32(<4 x i32> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_4xi32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.2d, #0xffffffff00000000 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %c = and <4 x i32> %b, ret <4 x i32> %c } @@ -157,6 +222,13 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_8xi32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v2.2d, #0xffffffff00000000 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %c = and <8 x i32> %b, ret <8 x i32> %c } @@ -170,6 +242,11 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_2xi64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov v0.d[0], xzr +; NONEON-NOSVE-NEXT: ret %c = and <2 x i64> %b, ret <2 
x i64> %c } @@ -185,6 +262,12 @@ define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: vls_sve_and_4xi64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov v0.d[0], xzr +; NONEON-NOSVE-NEXT: mov v1.d[0], xzr +; NONEON-NOSVE-NEXT: ret %c = and <4 x i64> %b, ret <4 x i64> %c } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll index aa42d5c2a8c132..cd6c2b489efe4c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,16 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) { ; CHECK-NEXT: sub z0.h, z0.h, #8 // =0x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: mov w8, #8 // =0x8 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %op) ret <4 x i8> %res } @@ -30,6 +41,11 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) { ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v8i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: clz v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op) ret <8 x i8> %res } @@ -42,6 +58,11 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) { ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op) ret <16 x i8> %res } @@ -55,6 +76,14 @@ define void @ctlz_v32i8(ptr %a) { ; CHECK-NEXT: clz z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: clz v0.16b, v0.16b +; NONEON-NOSVE-NEXT: clz v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op) store <32 x i8> %res, ptr %a @@ -71,6 +100,16 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) { ; CHECK-NEXT: sub z0.s, z0.s, #16 // =0x10 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -83,6 +122,11 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ 
-95,6 +139,11 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -108,6 +157,14 @@ define void @ctlz_v16i16(ptr %a) { ; CHECK-NEXT: clz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: clz v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -122,6 +179,11 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -134,6 +196,11 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -147,6 +214,14 @@ define void @ctlz_v8i32(ptr %a) { ; CHECK-NEXT: clz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: clz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -161,6 +236,27 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) { ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushr d1, d0, #1 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #2 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #4 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #8 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #16 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushr d1, d0, #32 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: mvn v0.8b, v0.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -173,6 +269,27 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) { ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #1 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #2 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #4 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #8 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #16 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #32 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; 
NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -186,6 +303,46 @@ define void @ctlz_v4i64(ptr %a) { ; CHECK-NEXT: clz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctlz_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #1 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #1 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #2 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #2 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #4 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #4 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #8 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #8 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #16 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #16 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #32 +; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #32 +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: mvn v1.16b, v1.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v1.4s, 
v1.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op) store <4 x i64> %res, ptr %a @@ -205,6 +362,14 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) { ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op) ret <4 x i8> %res } @@ -217,6 +382,11 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) { ; CHECK-NEXT: cnt z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op) ret <8 x i8> %res } @@ -229,6 +399,11 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) { ; CHECK-NEXT: cnt z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op) ret <16 x i8> %res } @@ -242,6 +417,14 @@ define void @ctpop_v32i8(ptr %a) { ; CHECK-NEXT: cnt z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x 
i8> %op) store <32 x i8> %res, ptr %a @@ -257,6 +440,15 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) { ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -269,6 +461,12 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) { ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ -281,6 +479,12 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) { ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -294,6 +498,16 @@ define void @ctpop_v16i16(ptr %a) { ; CHECK-NEXT: cnt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> 
@llvm.ctpop.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -308,6 +522,13 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) { ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -320,6 +541,13 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) { ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -333,6 +561,18 @@ define void @ctpop_v8i32(ptr %a) { ; CHECK-NEXT: cnt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -347,6 +587,14 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) { ; CHECK-NEXT: cnt z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; 
NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -359,6 +607,14 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) { ; CHECK-NEXT: cnt z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -372,6 +628,20 @@ define void @ctpop_v4i64(ptr %a) { ; CHECK-NEXT: cnt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ctpop_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op) store <4 x i64> %res, ptr %a @@ -392,6 +662,21 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #256 // =0x100 +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub v1.4h, v0.4h, v2.4h +; 
NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: sub v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %op) ret <4 x i8> %res } @@ -405,6 +690,14 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) { ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.8b, #1 +; NONEON-NOSVE-NEXT: sub v1.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op) ret <8 x i8> %res } @@ -418,6 +711,14 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) { ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.16b, #1 +; NONEON-NOSVE-NEXT: sub v1.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op) ret <16 x i8> %res } @@ -433,6 +734,19 @@ define void @cttz_v32i8(ptr %a) { ; CHECK-NEXT: clz z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #1 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v3.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: sub v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op) store <32 x i8> 
%res, ptr %a @@ -449,6 +763,21 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #65536 // =0x10000 +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub v1.2s, v0.2s, v2.2s +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: sub v0.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -462,6 +791,18 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: sub v1.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: sub v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ -475,6 +816,18 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) { ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.8h, w8 +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: sub v1.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: dup v1.8h, w8 +; NONEON-NOSVE-NEXT: 
clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -490,6 +843,24 @@ define void @cttz_v16i16(ptr %a) { ; CHECK-NEXT: clz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 +; NONEON-NOSVE-NEXT: sub v3.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: dup v2.8h, w8 +; NONEON-NOSVE-NEXT: clz v1.8h, v1.8h +; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -505,6 +876,18 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 +; NONEON-NOSVE-NEXT: sub v1.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: dup v1.2s, w8 +; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: sub v0.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -518,6 +901,18 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) { ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v4i32: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.4s, w8 +; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 +; NONEON-NOSVE-NEXT: sub v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: dup v1.4s, w8 +; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -533,6 +928,24 @@ define void @cttz_v8i32(ptr %a) { ; CHECK-NEXT: clz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 +; NONEON-NOSVE-NEXT: sub v3.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: dup v2.4s, w8 +; NONEON-NOSVE-NEXT: clz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -548,6 +961,18 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) { ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: sub d1, d0, d1 +; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <1 
x i64> @llvm.cttz.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -561,6 +986,18 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) { ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: dup v1.2d, x8 +; NONEON-NOSVE-NEXT: sub v1.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -576,6 +1013,26 @@ define void @cttz_v4i64(ptr %a) { ; CHECK-NEXT: clz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: cttz_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: sub v3.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: sub v0.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h +; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op) store <4 x i64> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll index 260ad16581f139..7e93ee99ed7494 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -11,6 +12,12 @@ define void @bitcast_v4i8(ptr %a, ptr %b) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: st1b { z0.h }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i8>, ptr %a %cast = bitcast <4 x i8> %load to <4 x i8> store volatile <4 x i8> %cast, ptr %b @@ -23,6 +30,12 @@ define void @bitcast_v8i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <8 x i8>, ptr %a %cast = bitcast <8 x i8> %load to <8 x i8> store volatile <8 x i8> %cast, ptr %b @@ -35,6 +48,12 @@ define void @bitcast_v16i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <16 x i8>, ptr %a %cast = bitcast <16 x i8> %load to <16 x i8> store volatile <16 x i8> %cast, ptr %b @@ -49,6 +68,14 @@ define void @bitcast_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x1, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v32i8: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <32 x i8>, ptr %a %cast = bitcast <32 x i8> %load to <32 x i8> store volatile <32 x i8> %cast, ptr %b @@ -72,6 +99,16 @@ define void @bitcast_v2i16(ptr %a, ptr %b) { ; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: add x8, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <2 x i16>, ptr %a %cast = bitcast <2 x i16> %load to <2 x half> store volatile <2 x half> %cast, ptr %b @@ -84,6 +121,12 @@ define void @bitcast_v4i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i16>, ptr %a %cast = bitcast <4 x i16> %load to <4 x half> store volatile <4 x half> %cast, ptr %b @@ -96,6 +139,12 @@ define void @bitcast_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <8 x i16>, ptr %a %cast = bitcast <8 x i16> %load to <8 x half> store volatile <8 x half> %cast, ptr %b @@ -110,6 +159,14 @@ define void @bitcast_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x1, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, 
[x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <16 x i16>, ptr %a %cast = bitcast <16 x i16> %load to <16 x half> store volatile <16 x half> %cast, ptr %b @@ -122,6 +179,12 @@ define void @bitcast_v2i32(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <2 x i32>, ptr %a %cast = bitcast <2 x i32> %load to <2 x float> store volatile <2 x float> %cast, ptr %b @@ -134,6 +197,12 @@ define void @bitcast_v4i32(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i32>, ptr %a %cast = bitcast <4 x i32> %load to <4 x float> store volatile <4 x float> %cast, ptr %b @@ -148,6 +217,14 @@ define void @bitcast_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x1, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <8 x i32>, ptr %a %cast = bitcast <8 x i32> %load to <8 x float> store volatile <8 x float> %cast, ptr %b @@ -160,6 +237,12 @@ define void @bitcast_v1i64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <1 x i64>, ptr %a %cast 
= bitcast <1 x i64> %load to <1 x double> store volatile <1 x double> %cast, ptr %b @@ -172,6 +255,12 @@ define void @bitcast_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <2 x i64>, ptr %a %cast = bitcast <2 x i64> %load to <2 x double> store volatile <2 x double> %cast, ptr %b @@ -186,6 +275,14 @@ define void @bitcast_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x1, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitcast_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i64>, ptr %a %cast = bitcast <4 x i64> %load to <4 x double> store volatile <4 x double> %cast, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll index 9a07bd8bd5ac9f..6b8077053b590f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64" @@ -30,6 +31,17 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fixed_bitselect_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x2] +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: ret %pre_cond = load <8 x i32>, ptr %pre_cond_ptr %left = load <8 x i32>, ptr %left_ptr %right = load <8 x i32>, ptr %right_ptr diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll index aec434b4819d70..318a9cf7d738b2 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -10,6 +11,12 @@ define void @build_vector_7_inc1_v4i1(ptr %a) { ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: strb w8, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_7_inc1_v4i1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: strb w8, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i1> , ptr %a, align 1 ret void } @@ -23,6 +30,15 @@ define void @build_vector_7_inc1_v32i8(ptr %a) { ; CHECK-NEXT: add z1.b, z1.b, #23 // =0x17 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_7_inc1_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI1_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI1_0] +; NONEON-NOSVE-NEXT: ldr 
q1, [x9, :lo12:.LCPI1_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <32 x i8> , ptr %a, align 1 ret void } @@ -35,6 +51,15 @@ define void @build_vector_0_inc2_v16i16(ptr %a) { ; CHECK-NEXT: add z0.h, z0.h, #16 // =0x10 ; CHECK-NEXT: str q0, [x0, #16] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI2_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI2_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x i16> , ptr %a, align 2 ret void } @@ -48,6 +73,15 @@ define void @build_vector_0_dec3_v8i32(ptr %a) { ; CHECK-NEXT: add z1.s, z0.s, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI3_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI3_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x i32> , ptr %a, align 4 ret void } @@ -64,6 +98,15 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) { ; CHECK-NEXT: add z0.d, z0.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI4_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI4_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI4_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i64> , ptr %a, align 8 ret void } @@ -76,6 +119,15 @@ define void @build_vector_no_stride_v4i64(ptr %a) { ; CHECK-NEXT: index z1.d, #0, #4 ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_no_stride_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, 
.LCPI5_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI5_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI5_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i64> , ptr %a, align 8 ret void } @@ -89,6 +141,15 @@ define void @build_vector_0_inc2_v16f16(ptr %a) { ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI6_1] ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI6_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI6_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI6_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x half> , ptr %a, align 2 ret void } @@ -103,6 +164,15 @@ define void @build_vector_0_dec3_v8f32(ptr %a) { ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI7_1] ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI7_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI7_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI7_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x float> , ptr %a, align 4 ret void } @@ -117,6 +187,15 @@ define void @build_vector_minus2_dec32_v4f64(ptr %a) { ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI8_1] ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI8_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI8_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x double> , ptr %a, align 8 ret void } @@ -131,6 +210,15 @@ define void @build_vector_no_stride_v4f64(ptr %a) { ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI9_1] 
; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_no_stride_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI9_1 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI9_0] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI9_1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x double> , ptr %a, align 8 ret void } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll index 82e75d6efda352..d2bfc7d4e80969 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -40,6 +41,11 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> ret <8 x i8> %res } @@ -53,6 +59,13 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <8 x i8> %op1, 
<8 x i8> %op2, <16 x i32> ret <16 x i8> %res @@ -65,6 +78,13 @@ define void @concat_v32i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i8>, ptr %a %op2 = load <16 x i8>, ptr %b %res = shufflevector <16 x i8> %op1, <16 x i8> %op2, <32 x i32> , ptr %a %op2 = load <32 x i8>, ptr %b %res = shufflevector <32 x i8> %op1, <32 x i8> %op2, <64 x i32> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> ret <4 x i16> %res } @@ -135,6 +168,13 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> ret <8 x i16> %res } @@ -146,6 +186,13 @@ define void @concat_v16i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b %res = shufflevector <8 x i16> %op1, <8 x i16> %op2, <16 x i32> , ptr %a %op2 = 
load <16 x i16>, ptr %b %res = shufflevector <16 x i16> %op1, <16 x i16> %op2, <32 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) { ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> ret <2 x i32> %res } @@ -199,6 +259,13 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> ret <4 x i32> %res } @@ -210,6 +277,13 @@ define void @concat_v8i32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %op2 = load <4 x i32>, ptr %b %res = shufflevector <4 x i32> %op1, <4 x i32> %op2, <8 x i32> @@ -225,6 +299,14 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: stp q0, q1, [x2, #32] ; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2, #32] +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = shufflevector <8 x i32> %op1, <8 x i32> 
%op2, <16 x i32> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> ret <2 x i64> %res } @@ -258,6 +347,13 @@ define void @concat_v4i64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %res = shufflevector <2 x i64> %op1, <2 x i64> %op2, <4 x i32> @@ -273,6 +369,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: stp q0, q1, [x2, #32] ; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2, #32] +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = shufflevector <4 x i64> %op1, <4 x i64> %op2, <8 x i32> @@ -300,6 +404,11 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> ret <4 x half> %res } @@ -313,6 +422,13 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> 
%op2) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> ret <8 x half> %res } @@ -324,6 +440,13 @@ define void @concat_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %op2 = load <8 x half>, ptr %b %res = shufflevector <8 x half> %op1, <8 x half> %op2, <16 x i32> , ptr %a %op2 = load <16 x half>, ptr %b %res = shufflevector <16 x half> %op1, <16 x half> %op2, <32 x i32> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) { ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> ret <2 x float> %res } @@ -377,6 +513,13 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> ret <4 x float> %res } @@ -388,6 
+531,13 @@ define void @concat_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %op2 = load <4 x float>, ptr %b %res = shufflevector <4 x float> %op1, <4 x float> %op2, <8 x i32> @@ -403,6 +553,14 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: stp q0, q1, [x2, #32] ; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2, #32] +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = shufflevector <8 x float> %op1, <8 x float> %op2, <16 x i32> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> ret <2 x double> %res } @@ -436,6 +601,13 @@ define void @concat_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %op2 = load <2 x double>, ptr %b %res = shufflevector <2 x double> %op1, <2 x double> 
%op2, <4 x i32> @@ -451,6 +623,14 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: stp q0, q1, [x2, #32] ; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2, #32] +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = shufflevector <4 x double> %op1, <4 x double> %op2, <8 x i32> @@ -468,6 +648,12 @@ define void @concat_v32i8_undef(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v32i8_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i8>, ptr %a %res = shufflevector <16 x i8> %op1, <16 x i8> undef, <32 x i32> , ptr %a %res = shufflevector <8 x i16> %op1, <8 x i16> undef, <16 x i32> @@ -496,6 +688,12 @@ define void @concat_v8i32_undef(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = shufflevector <4 x i32> %op1, <4 x i32> undef, <8 x i32> store <8 x i32> %res, ptr %b @@ -508,6 +706,12 @@ define void @concat_v4i64_undef(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i64_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %res = shufflevector <2 x i64> %op1, <2 x i64> undef, <4 x i32> store <4 x i64> %res, ptr %b @@ -524,6 +728,12 @@ define void @concat_v32i8_4op(ptr %a, ptr %b) { ; CHECK-NEXT: 
ldr d0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v32i8_4op: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i8>, ptr %a %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> @@ -541,6 +751,12 @@ define void @concat_v16i16_4op(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v16i16_4op: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> %res = shufflevector <8 x i16> %shuffle, <8 x i16> undef, <16 x i32> , ptr %a %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> %res = shufflevector <4 x i32> %shuffle, <4 x i32> undef, <8 x i32> @@ -568,6 +790,12 @@ define void @concat_v4i64_4op(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: concat_v4i64_4op: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <1 x i64>, ptr %a %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> %res = shufflevector <2 x i64> %shuffle, <2 x i64> undef, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 040e5861e98101..728b85d39bb37f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | 
FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -11,6 +12,12 @@ define <8 x i16> @load_zext_v8i8i16(ptr %ap) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v8i8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ret %a = load <8 x i8>, ptr %ap %val = zext <8 x i8> %a to <8 x i16> ret <8 x i16> %val @@ -23,6 +30,12 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap) { ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v4i16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ret %a = load <4 x i16>, ptr %ap %val = zext <4 x i16> %a to <4 x i32> ret <4 x i32> %val @@ -35,6 +48,12 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) { ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v2i32i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ret %a = load <2 x i32>, ptr %ap %val = zext <2 x i32> %a to <2 x i64> ret <2 x i64> %val @@ -54,6 +73,19 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) { ; CHECK-NEXT: mov x7, xzr ; CHECK-NEXT: fmov x4, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v2i64i256: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: mov x1, xzr +; NONEON-NOSVE-NEXT: mov x2, xzr +; NONEON-NOSVE-NEXT: mov x3, xzr +; NONEON-NOSVE-NEXT: mov x5, xzr +; NONEON-NOSVE-NEXT: mov x6, xzr +; NONEON-NOSVE-NEXT: mov x4, v0.d[1] +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: mov x7, xzr +; 
NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = zext <2 x i64> %a to <2 x i256> ret <2 x i256> %val @@ -75,6 +107,24 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap) { ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 ; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_sext_v16i8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: sshll v1.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v2.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v1.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i8>, ptr %ap %val = sext <16 x i8> %a to <16 x i32> ret <16 x i32> %val @@ -90,6 +140,17 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_sext_v8i16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %ap %val = sext <8 x i16> %a to <8 x i32> ret <8 x i32> %val @@ -121,6 +182,39 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) { ; CHECK-NEXT: stp x12, x12, [x8, #112] ; CHECK-NEXT: stp x11, x12, [x8, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_sext_v4i32i256: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: add x10, x8, #32 +; NONEON-NOSVE-NEXT: add x11, x8, #96 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: mov x9, v0.d[1] +; NONEON-NOSVE-NEXT: st1 { v0.d }[1], [x10] +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: st1 { v1.d }[1], [x11] +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: asr x10, x10, #63 +; NONEON-NOSVE-NEXT: str d0, [x8] +; NONEON-NOSVE-NEXT: asr x9, x9, #63 +; NONEON-NOSVE-NEXT: str d1, [x8, #64] +; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #16] +; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #48] +; NONEON-NOSVE-NEXT: str x9, [x8, #40] +; NONEON-NOSVE-NEXT: fmov x9, d1 +; NONEON-NOSVE-NEXT: str x10, [x8, #8] +; NONEON-NOSVE-NEXT: asr x10, x11, #63 +; NONEON-NOSVE-NEXT: asr x9, x9, #63 +; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #112] +; NONEON-NOSVE-NEXT: str x10, [x8, #104] +; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #80] +; NONEON-NOSVE-NEXT: str x9, [x8, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = sext <4 x i32> %a to <4 x i256> ret <4 x i256> %val @@ -154,6 +248,22 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) { ; CHECK-NEXT: fmov x1, d6 ; CHECK-NEXT: fmov x5, d0 ; CHECK-NEXT: ret +; 
+; NONEON-NOSVE-LABEL: load_sext_v2i64i256: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: dup v1.2d, v0.d[1] +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: asr x1, x0, #63 +; NONEON-NOSVE-NEXT: asr x5, x8, #63 +; NONEON-NOSVE-NEXT: mov x2, x1 +; NONEON-NOSVE-NEXT: mov x3, x1 +; NONEON-NOSVE-NEXT: mov v1.d[1], x5 +; NONEON-NOSVE-NEXT: mov x6, x5 +; NONEON-NOSVE-NEXT: mov x7, x5 +; NONEON-NOSVE-NEXT: fmov x4, d1 +; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = sext <2 x i64> %a to <2 x i256> ret <2 x i256> %val @@ -187,6 +297,34 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap) { ; CHECK-NEXT: // kill: def $q6 killed $q6 killed $z6 ; CHECK-NEXT: // kill: def $q7 killed $q7 killed $z7 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_zext_v16i16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushll v2.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v3.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v4.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v5.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v2.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] +; NONEON-NOSVE-NEXT: stp q5, q3, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d16, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d17, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v1.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: ushll v6.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ushll v5.2d, v16.2s, #0 +; NONEON-NOSVE-NEXT: ushll v7.2d, v17.2s, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %ap %val = zext <16 x i16> %a to <16 x i64> ret <16 x i64> %val diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll index 45a804becbc557..ec6341d6085a0a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -27,6 +28,11 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8i1: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %ret = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %op, i64 4) ret <4 x i1> %ret } @@ -54,6 +60,11 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4) ret <4 x i8> %ret } @@ -65,6 +76,14 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %op, i64 8) ret <8 x i8> %ret } @@ -75,6 +94,12 @@ define void @extract_subvector_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %ret = call <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8> %op, i64 16) store <16 x i8> %ret, ptr %b @@ -91,6 +116,15 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2) ret <2 x i16> %ret } @@ -102,6 +136,14 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %op, i64 4) ret <4 x i16> %ret } @@ -112,6 +154,12 @@ define void @extract_subvector_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %ret = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> %op, i64 8) store <8 x i16> %ret, ptr %b @@ -127,6 +175,12 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) { ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: ret %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1) ret <1 x i32> %ret } @@ -138,6 +192,14 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4i32: 
+; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %op, i64 2) ret <2 x i32> %ret } @@ -148,6 +210,12 @@ define void @extract_subvector_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %ret = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %op, i64 4) store <4 x i32> %ret, ptr %b @@ -163,6 +231,14 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64> %op, i64 1) ret <1 x i64> %ret } @@ -173,6 +249,12 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %ret = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> %op, i64 2) store <2 x i64> %ret, ptr %b @@ -190,6 +272,12 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) { ; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: ret %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2) ret <2 x half> %ret } @@ -201,6 +289,14 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half> %op, i64 4) ret <4 x half> %ret } @@ -211,6 +307,12 @@ define void @extract_subvector_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %ret = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> %op, i64 8) store <8 x half> %ret, ptr %b @@ -226,6 +328,12 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) { ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: ret %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1) ret <1 x float> %ret } @@ -237,6 +345,14 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> %op, i64 2) ret <2 x float> %ret } @@ -247,6 +363,12 @@ define void @extract_subvector_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %ret = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> %op, i64 4) store <4 x float> %ret, ptr %b @@ -262,6 +384,14 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %ret = call <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double> %op, i64 1) ret <1 x double> %ret } @@ -272,6 +402,12 @@ define void @extract_subvector_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extract_subvector_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %ret = call <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double> %op, i64 2) store <2 x double> %ret, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll index 9c3b5e14289dc1..ac60a614d7ce6c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -15,6 +16,12 @@ define half @extractelement_v2f16(<2 x half> %op1) { ; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h0, v0.h[1] +; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x half> %op1, i64 1 ret half %r } @@ -26,6 +33,12 @@ define half @extractelement_v4f16(<4 x half> %op1) { ; 
CHECK-NEXT: mov z0.h, z0.h[3] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: ret %r = extractelement <4 x half> %op1, i64 3 ret half %r } @@ -37,6 +50,11 @@ define half @extractelement_v8f16(<8 x half> %op1) { ; CHECK-NEXT: mov z0.h, z0.h[7] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: ret %r = extractelement <8 x half> %op1, i64 7 ret half %r } @@ -48,6 +66,11 @@ define half @extractelement_v16f16(ptr %a) { ; CHECK-NEXT: mov z0.h, z0.h[7] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0, #30] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %r = extractelement <16 x half> %op1, i64 15 ret half %r @@ -60,6 +83,12 @@ define float @extractelement_v2f32(<2 x float> %op1) { ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov s0, v0.s[1] +; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x float> %op1, i64 1 ret float %r } @@ -71,6 +100,11 @@ define float @extractelement_v4f32(<4 x float> %op1) { ; CHECK-NEXT: mov z0.s, z0.s[3] ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov s0, v0.s[3] +; NONEON-NOSVE-NEXT: ret %r = extractelement <4 x float> %op1, i64 3 ret float %r } @@ -82,6 +116,11 @@ define float @extractelement_v8f32(ptr %a) { ; 
CHECK-NEXT: mov z0.s, z0.s[3] ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0, #28] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %r = extractelement <8 x float> %op1, i64 7 ret float %r @@ -91,6 +130,10 @@ define double @extractelement_v1f64(<1 x double> %op1) { ; CHECK-LABEL: extractelement_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %r = extractelement <1 x double> %op1, i64 0 ret double %r } @@ -101,6 +144,11 @@ define double @extractelement_v2f64(<2 x double> %op1) { ; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov d0, v0.d[1] +; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x double> %op1, i64 1 ret double %r } @@ -112,6 +160,11 @@ define double @extractelement_v4f64(ptr %a) { ; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extractelement_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0, #24] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %r = extractelement <4 x double> %op1, i64 3 ret double %r diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index 21ce689f68e23a..c1d84f6a15ed8c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; 
RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -28,6 +29,16 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str d1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: ldr d2, [x1] +; NONEON-NOSVE-NEXT: dup v0.4h, w8 +; NONEON-NOSVE-NEXT: bsl v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x half>, ptr %bp %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) @@ -54,6 +65,16 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %ap %b = load <8 x half>, ptr %bp %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) @@ -84,6 +105,17 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z3.d, z3.d, z4.d, z0.d ; SVE2-NEXT: stp q2, q3, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v16f16_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl 
v0.16b, v3.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <16 x half>, ptr %ap %b = load <16 x half>, ptr %bp %r = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b) @@ -112,6 +144,16 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str d1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d0, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: ldr d2, [x1] +; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s +; NONEON-NOSVE-NEXT: bsl v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %ap %b = load <2 x float>, ptr %bp %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) @@ -138,6 +180,16 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %ap %b = load <4 x float>, ptr %bp %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) @@ -168,6 +220,17 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z3.d, z3.d, z4.d, z0.d ; SVE2-NEXT: stp q2, q3, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v8f32_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b +; 
NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <8 x float>, ptr %ap %b = load <8 x float>, ptr %bp %r = call <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b) @@ -196,6 +259,16 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z1.d, z1.d, z2.d, z0.d ; SVE2-NEXT: str q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %ap %b = load <2 x double>, ptr %bp %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) @@ -226,6 +299,17 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z3.d, z3.d, z4.d, z0.d ; SVE2-NEXT: stp q2, q3, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x double>, ptr %bp %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) @@ -260,6 +344,17 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str d2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d0, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldr d2, [x0] +; 
NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d +; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s +; NONEON-NOSVE-NEXT: bsl v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %ap %b = load <2 x double>, ptr %bp %tmp0 = fptrunc <2 x double> %b to <2 x float> @@ -304,6 +399,18 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str q2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.4s, v2.2d +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %ap %b = load <4 x double>, ptr %bp %tmp0 = fptrunc <4 x double> %b to <4 x float> @@ -337,6 +444,17 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str q2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %ap %b = load < 2 x float>, ptr %bp %tmp0 = fpext <2 x float> %b to <2 x double> @@ -381,6 +499,23 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z4.d, z4.d, z1.d, z2.d ; SVE2-NEXT: stp q3, q4, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; 
NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtl v4.2d, v4.2s +; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x float>, ptr %bp %tmp0 = fpext <4 x float> %b to <4 x double> @@ -416,6 +551,17 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str d2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldr d2, [x0] +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: bit v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x float>, ptr %bp %tmp0 = fptrunc <4 x float> %b to <4 x half> @@ -471,6 +617,25 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE2-NEXT: str d5, [x0] ; SVE2-NEXT: add sp, sp, #16 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x1] +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: fcvt h1, d1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: mov d2, v2.d[1] +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: ldr d2, [x0] +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: dup v1.4h, w8 +; 
NONEON-NOSVE-NEXT: bit v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x double>, ptr %bp %tmp0 = fptrunc <4 x double> %b to <4 x half> @@ -514,6 +679,18 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str q2, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: dup v1.8h, w8 +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %ap %b = load <8 x float>, ptr %bp %tmp0 = fptrunc <8 x float> %b to <8 x half> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll index b0a82e699939f1..b51b89d08844d0 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,14 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: 
fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fadd <2 x half> %op1, %op2 ret <2 x half> %res } @@ -30,6 +39,14 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fadd <4 x half> %op1, %op2 ret <4 x half> %res } @@ -43,6 +60,18 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fadd <8 x half> %op1, %op2 ret <8 x half> %res } @@ -58,6 +87,29 @@ define void @fadd_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; 
NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fadd <16 x half> %op1, %op2 @@ -74,6 +126,11 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = fadd <2 x float> %op1, %op2 ret <2 x float> %res } @@ -87,6 +144,11 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fadd <4 x float> %op1, %op2 ret <4 x float> %res } @@ -102,6 +164,15 @@ define void @fadd_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fadd <8 x float> %op1, %op2 @@ -118,6 +189,11 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, 
z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = fadd <2 x double> %op1, %op2 ret <2 x double> %res } @@ -133,6 +209,15 @@ define void @fadd_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = fadd <4 x double> %op1, %op2 @@ -153,6 +238,14 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x half> %op1, %op2 ret <2 x half> %res } @@ -166,6 +259,14 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fdiv <4 x half> %op1, %op2 ret <4 x half> %res } @@ -179,6 +280,18 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h ; 
CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fdiv v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fdiv v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fdiv <8 x half> %op1, %op2 ret <8 x half> %res } @@ -194,6 +307,30 @@ define void @fdiv_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fdiv z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q4, q1, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v5.4s, v4.8h +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v4.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fdiv v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: ldr q3, [x0] +; NONEON-NOSVE-NEXT: fcvtl2 v6.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fdiv v3.4s, v3.4s, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fdiv v5.4s, v6.4s, v5.4s +; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fdiv <16 x half> %op1, %op2 @@ -210,6 +347,11 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v2f32: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fdiv v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x float> %op1, %op2 ret <2 x float> %res } @@ -223,6 +365,11 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fdiv <4 x float> %op1, %op2 ret <4 x float> %res } @@ -238,6 +385,15 @@ define void @fdiv_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fdiv z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fdiv v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fdiv v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fdiv <8 x float> %op1, %op2 @@ -254,6 +410,11 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fdiv v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x double> %op1, %op2 ret <2 x double> %res } @@ -269,6 +430,15 @@ define void @fdiv_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fdiv z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fdiv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fdiv v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fdiv v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr 
%a %op2 = load <4 x double>, ptr %b %res = fdiv <4 x double> %op1, %op2 @@ -290,6 +460,46 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d2 killed $d2 def $q2 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: fcvt s16, h0 +; NONEON-NOSVE-NEXT: mov h17, v2.h[2] +; NONEON-NOSVE-NEXT: mov h18, v1.h[2] +; NONEON-NOSVE-NEXT: mov h19, v0.h[2] +; NONEON-NOSVE-NEXT: mov h2, v2.h[3] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 +; NONEON-NOSVE-NEXT: mov h16, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmadd s3, s5, s4, s3 +; NONEON-NOSVE-NEXT: fcvt s4, h17 +; NONEON-NOSVE-NEXT: fcvt s5, h18 +; NONEON-NOSVE-NEXT: fcvt h0, s6 +; NONEON-NOSVE-NEXT: fmadd s4, s7, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h16 +; NONEON-NOSVE-NEXT: mov v0.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fmadd s1, s5, s1, s2 +; NONEON-NOSVE-NEXT: mov v0.h[2], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) ret <2 x half> %res } @@ -304,6 +514,46 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> 
%op2, <4 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d2 killed $d2 def $q2 +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: fcvt s16, h0 +; NONEON-NOSVE-NEXT: mov h17, v2.h[2] +; NONEON-NOSVE-NEXT: mov h18, v1.h[2] +; NONEON-NOSVE-NEXT: mov h19, v0.h[2] +; NONEON-NOSVE-NEXT: mov h2, v2.h[3] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 +; NONEON-NOSVE-NEXT: mov h16, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmadd s3, s5, s4, s3 +; NONEON-NOSVE-NEXT: fcvt s4, h17 +; NONEON-NOSVE-NEXT: fcvt s5, h18 +; NONEON-NOSVE-NEXT: fcvt h0, s6 +; NONEON-NOSVE-NEXT: fmadd s4, s7, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h16 +; NONEON-NOSVE-NEXT: mov v0.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fmadd s1, s5, s1, s2 +; NONEON-NOSVE-NEXT: mov v0.h[2], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ret <4 x half> %res } @@ -318,6 +568,79 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fma_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h3, v2.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: fcvt s16, h0 +; NONEON-NOSVE-NEXT: mov h17, v2.h[2] +; NONEON-NOSVE-NEXT: mov h18, v1.h[2] +; NONEON-NOSVE-NEXT: mov h19, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 +; NONEON-NOSVE-NEXT: fcvt s7, h17 +; NONEON-NOSVE-NEXT: fcvt s16, h18 +; NONEON-NOSVE-NEXT: fcvt s17, h19 +; NONEON-NOSVE-NEXT: mov h18, v1.h[3] +; NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: fmadd s4, s5, s4, s3 +; NONEON-NOSVE-NEXT: mov h5, v2.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fmadd s6, s17, s16, s7 +; NONEON-NOSVE-NEXT: mov h17, v2.h[4] +; NONEON-NOSVE-NEXT: fcvt s7, h18 +; NONEON-NOSVE-NEXT: fcvt s16, h19 +; NONEON-NOSVE-NEXT: mov h18, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov h19, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov v3.h[1], v4.h[0] +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: fmadd s5, s16, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: mov v3.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: mov h6, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fmadd s17, s19, s18, s17 +; NONEON-NOSVE-NEXT: mov h18, v1.h[6] +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmadd s4, s16, s7, s4 +; 
NONEON-NOSVE-NEXT: mov v3.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h18 +; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: fcvt h16, s17 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fmadd s5, s7, s6, s5 +; NONEON-NOSVE-NEXT: mov v3.h[4], v16.h[0] +; NONEON-NOSVE-NEXT: fmadd s0, s0, s1, s2 +; NONEON-NOSVE-NEXT: mov v3.h[5], v4.h[0] +; NONEON-NOSVE-NEXT: fcvt h4, s5 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v3.h[6], v4.h[0] +; NONEON-NOSVE-NEXT: mov v3.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v3.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ret <8 x half> %res } @@ -334,6 +657,150 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.h, p0/m, z3.h, z4.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q3, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q4, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q5, q2, [x2] +; NONEON-NOSVE-NEXT: mov h25, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: mov h24, v0.h[2] +; NONEON-NOSVE-NEXT: mov h17, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s18, h1 +; NONEON-NOSVE-NEXT: mov h22, v1.h[2] +; NONEON-NOSVE-NEXT: mov h16, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: mov h20, v2.h[2] +; NONEON-NOSVE-NEXT: mov h26, v5.h[1] +; NONEON-NOSVE-NEXT: mov h27, v4.h[1] +; NONEON-NOSVE-NEXT: mov h28, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s25, h25 +; NONEON-NOSVE-NEXT: mov h7, v2.h[3] +; NONEON-NOSVE-NEXT: mov h29, v4.h[2] +; NONEON-NOSVE-NEXT: fcvt s23, h17 +; NONEON-NOSVE-NEXT: mov h17, v0.h[3] +; NONEON-NOSVE-NEXT: mov h30, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s21, h16 +; NONEON-NOSVE-NEXT: fmadd s6, s19, s18, s6 +; NONEON-NOSVE-NEXT: fcvt s18, h20 +; NONEON-NOSVE-NEXT: fcvt s19, 
h22 +; NONEON-NOSVE-NEXT: fcvt s20, h24 +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s22, h5 +; NONEON-NOSVE-NEXT: fcvt s24, h4 +; NONEON-NOSVE-NEXT: fcvt s26, h26 +; NONEON-NOSVE-NEXT: fcvt s27, h27 +; NONEON-NOSVE-NEXT: fcvt s28, h28 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fmadd s21, s25, s23, s21 +; NONEON-NOSVE-NEXT: fcvt s23, h3 +; NONEON-NOSVE-NEXT: mov h25, v5.h[2] +; NONEON-NOSVE-NEXT: fmadd s18, s20, s19, s18 +; NONEON-NOSVE-NEXT: mov h19, v3.h[2] +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: mov h31, v0.h[4] +; NONEON-NOSVE-NEXT: fmadd s26, s28, s27, s26 +; NONEON-NOSVE-NEXT: mov h27, v4.h[3] +; NONEON-NOSVE-NEXT: mov h28, v3.h[3] +; NONEON-NOSVE-NEXT: fmadd s22, s23, s24, s22 +; NONEON-NOSVE-NEXT: fcvt h20, s21 +; NONEON-NOSVE-NEXT: mov h21, v2.h[4] +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt s24, h29 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fmadd s16, s17, s16, s7 +; NONEON-NOSVE-NEXT: mov h25, v5.h[3] +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt h26, s26 +; NONEON-NOSVE-NEXT: mov h29, v2.h[5] +; NONEON-NOSVE-NEXT: mov v6.h[1], v20.h[0] +; NONEON-NOSVE-NEXT: fcvt s17, h21 +; NONEON-NOSVE-NEXT: fcvt s20, h30 +; NONEON-NOSVE-NEXT: fmadd s19, s19, s24, s23 +; NONEON-NOSVE-NEXT: fcvt s21, h31 +; NONEON-NOSVE-NEXT: fcvt h7, s22 +; NONEON-NOSVE-NEXT: fcvt s22, h25 +; NONEON-NOSVE-NEXT: fcvt s23, h27 +; NONEON-NOSVE-NEXT: fcvt s24, h28 +; NONEON-NOSVE-NEXT: mov h25, v5.h[4] +; NONEON-NOSVE-NEXT: mov h27, v4.h[4] +; NONEON-NOSVE-NEXT: mov h28, v3.h[4] +; NONEON-NOSVE-NEXT: mov h30, v1.h[5] +; NONEON-NOSVE-NEXT: mov h31, v0.h[5] +; NONEON-NOSVE-NEXT: mov v6.h[2], v18.h[0] +; NONEON-NOSVE-NEXT: fmadd s17, s21, s20, s17 +; NONEON-NOSVE-NEXT: mov v7.h[1], v26.h[0] +; NONEON-NOSVE-NEXT: fcvt h18, s19 +; NONEON-NOSVE-NEXT: fmadd s19, s24, s23, s22 +; NONEON-NOSVE-NEXT: mov h26, v5.h[5] +; 
NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s20, h25 +; NONEON-NOSVE-NEXT: fcvt s21, h27 +; NONEON-NOSVE-NEXT: fcvt s22, h28 +; NONEON-NOSVE-NEXT: mov h27, v4.h[5] +; NONEON-NOSVE-NEXT: mov h28, v3.h[5] +; NONEON-NOSVE-NEXT: fcvt s23, h29 +; NONEON-NOSVE-NEXT: fcvt s24, h30 +; NONEON-NOSVE-NEXT: fcvt s25, h31 +; NONEON-NOSVE-NEXT: mov h29, v2.h[6] +; NONEON-NOSVE-NEXT: mov h30, v1.h[6] +; NONEON-NOSVE-NEXT: mov h31, v0.h[6] +; NONEON-NOSVE-NEXT: mov v7.h[2], v18.h[0] +; NONEON-NOSVE-NEXT: fcvt h18, s19 +; NONEON-NOSVE-NEXT: fmadd s19, s22, s21, s20 +; NONEON-NOSVE-NEXT: mov h20, v5.h[6] +; NONEON-NOSVE-NEXT: mov h21, v4.h[6] +; NONEON-NOSVE-NEXT: mov h22, v3.h[6] +; NONEON-NOSVE-NEXT: fcvt s26, h26 +; NONEON-NOSVE-NEXT: fmadd s23, s25, s24, s23 +; NONEON-NOSVE-NEXT: fcvt s27, h27 +; NONEON-NOSVE-NEXT: fcvt s28, h28 +; NONEON-NOSVE-NEXT: mov v6.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s17 +; NONEON-NOSVE-NEXT: fcvt s17, h29 +; NONEON-NOSVE-NEXT: fcvt s24, h30 +; NONEON-NOSVE-NEXT: fcvt s25, h31 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: fcvt s21, h21 +; NONEON-NOSVE-NEXT: fcvt s22, h22 +; NONEON-NOSVE-NEXT: mov v7.h[3], v18.h[0] +; NONEON-NOSVE-NEXT: fmadd s26, s28, s27, s26 +; NONEON-NOSVE-NEXT: fcvt h18, s19 +; NONEON-NOSVE-NEXT: mov h5, v5.h[7] +; NONEON-NOSVE-NEXT: mov h4, v4.h[7] +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: fmadd s17, s25, s24, s17 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmadd s19, s22, s21, s20 +; NONEON-NOSVE-NEXT: mov v6.h[4], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s23 +; NONEON-NOSVE-NEXT: mov v7.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fcvt h18, s26 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v6.h[5], v16.h[0] +; 
NONEON-NOSVE-NEXT: mov v7.h[5], v18.h[0] +; NONEON-NOSVE-NEXT: fmadd s3, s3, s4, s5 +; NONEON-NOSVE-NEXT: fcvt h4, s19 +; NONEON-NOSVE-NEXT: fcvt h5, s17 +; NONEON-NOSVE-NEXT: fmadd s0, s0, s1, s2 +; NONEON-NOSVE-NEXT: mov v7.h[6], v4.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s3 +; NONEON-NOSVE-NEXT: mov v6.h[6], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v7.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v6.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q7, q6, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %op3 = load <16 x half>, ptr %c @@ -352,6 +819,12 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) ret <2 x float> %res } @@ -366,6 +839,12 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) ret <4 x float> %res } @@ -382,6 +861,16 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.s, p0/m, z3.s, z4.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fmla v1.4s, v0.4s, v2.4s 
+; NONEON-NOSVE-NEXT: fmla v5.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %op3 = load <8 x float>, ptr %c @@ -400,6 +889,12 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) ret <2 x double> %res } @@ -416,6 +911,16 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.d, p0/m, z3.d, z4.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fmla v1.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: fmla v5.2d, v4.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %op3 = load <4 x double>, ptr %c @@ -437,6 +942,14 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fmul <2 x half> %op1, %op2 ret <2 x half> %res } @@ -450,6 +963,14 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fmul <4 x half> %op1, %op2 ret <4 x half> %res } @@ -463,6 +984,18 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fmul v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fmul v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fmul <8 x half> %op1, %op2 ret <8 x half> %res } @@ -478,6 +1011,29 @@ define void @fmul_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fmul v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fmul v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fmul v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmul v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; 
NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fmul <16 x half> %op1, %op2 @@ -494,6 +1050,11 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = fmul <2 x float> %op1, %op2 ret <2 x float> %res } @@ -507,6 +1068,11 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fmul <4 x float> %op1, %op2 ret <4 x float> %res } @@ -522,6 +1088,15 @@ define void @fmul_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmul v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmul v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fmul <8 x float> %op1, %op2 @@ -538,6 +1113,11 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmul v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = fmul <2 x double> %op1, %op2 ret <2 x double> %res } @@ -553,6 +1133,15 @@ define void 
@fmul_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fmul z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmul_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmul v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmul v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = fmul <4 x double> %op1, %op2 @@ -572,6 +1161,12 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) { ; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = fneg <2 x half> %op ret <2 x half> %res } @@ -584,6 +1179,12 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) { ; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = fneg <4 x half> %op ret <4 x half> %res } @@ -596,6 +1197,12 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) { ; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v1.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = fneg <8 x half> %op ret <8 x half> %res } @@ -609,6 +1216,15 @@ define void @fneg_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fneg z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
movi v0.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = fneg <16 x half> %op store <16 x half> %res, ptr %a @@ -623,6 +1239,11 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) { ; CHECK-NEXT: fneg z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fneg <2 x float> %op ret <2 x float> %res } @@ -635,6 +1256,11 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) { ; CHECK-NEXT: fneg z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fneg <4 x float> %op ret <4 x float> %res } @@ -648,6 +1274,14 @@ define void @fneg_v8f32(ptr %a) { ; CHECK-NEXT: fneg z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fneg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = fneg <8 x float> %op store <8 x float> %res, ptr %a @@ -662,6 +1296,11 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) { ; CHECK-NEXT: fneg z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fneg <2 x double> %op ret <2 x double> %res } @@ -675,6 +1314,14 @@ define void @fneg_v4f64(ptr %a) { ; CHECK-NEXT: fneg z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, 
q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fneg_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fneg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = fneg <4 x double> %op store <4 x double> %res, ptr %a @@ -693,6 +1340,30 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) { ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: mov h3, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsqrt s2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s1, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fsqrt s3, s3 +; NONEON-NOSVE-NEXT: fsqrt s4, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s2 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s3 +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s4 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -705,6 +1376,30 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) { ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: mov h3, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: 
fsqrt s2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s1, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fsqrt s3, s3 +; NONEON-NOSVE-NEXT: fsqrt s4, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s2 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s3 +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s4 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -717,6 +1412,48 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) { ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: mov h3, v0.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[3] +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: mov h7, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsqrt s2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h0 +; NONEON-NOSVE-NEXT: fcvt h0, s2 +; NONEON-NOSVE-NEXT: fsqrt s1, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s3, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s3 +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s4, s4 +; NONEON-NOSVE-NEXT: fcvt h1, s4 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s5, s5 +; NONEON-NOSVE-NEXT: fcvt h1, s5 +; NONEON-NOSVE-NEXT: mov v0.h[4], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s6, s6 +; NONEON-NOSVE-NEXT: fcvt h1, s6 +; 
NONEON-NOSVE-NEXT: mov v0.h[5], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s7, s7 +; NONEON-NOSVE-NEXT: fcvt h1, s7 +; NONEON-NOSVE-NEXT: mov v0.h[6], v1.h[0] +; NONEON-NOSVE-NEXT: fsqrt s2, s16 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v0.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -730,6 +1467,89 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fsqrt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q16, [x0] +; NONEON-NOSVE-NEXT: mov h0, v1.h[1] +; NONEON-NOSVE-NEXT: mov h17, v16.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s18, h16 +; NONEON-NOSVE-NEXT: mov h19, v16.h[2] +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: mov h20, v16.h[3] +; NONEON-NOSVE-NEXT: mov h5, v1.h[4] +; NONEON-NOSVE-NEXT: mov h21, v16.h[4] +; NONEON-NOSVE-NEXT: mov h6, v1.h[5] +; NONEON-NOSVE-NEXT: mov h22, v16.h[5] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fsqrt s2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: mov h7, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s21, h21 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s22, h22 +; NONEON-NOSVE-NEXT: mov h23, v16.h[6] +; NONEON-NOSVE-NEXT: mov h16, v16.h[7] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s23, h23 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[1], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s17, s17 +; 
NONEON-NOSVE-NEXT: fcvt h17, s17 +; NONEON-NOSVE-NEXT: fsqrt s18, s18 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: mov v18.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: fsqrt s3, s3 +; NONEON-NOSVE-NEXT: fcvt h0, s3 +; NONEON-NOSVE-NEXT: mov v2.h[2], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s19, s19 +; NONEON-NOSVE-NEXT: fcvt h17, s19 +; NONEON-NOSVE-NEXT: mov v18.h[2], v17.h[0] +; NONEON-NOSVE-NEXT: fsqrt s4, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s4 +; NONEON-NOSVE-NEXT: mov v2.h[3], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s20, s20 +; NONEON-NOSVE-NEXT: fcvt h3, s20 +; NONEON-NOSVE-NEXT: mov v18.h[3], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s5, s5 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: mov v2.h[4], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s21, s21 +; NONEON-NOSVE-NEXT: fcvt h3, s21 +; NONEON-NOSVE-NEXT: mov v18.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s6, s6 +; NONEON-NOSVE-NEXT: fcvt h0, s6 +; NONEON-NOSVE-NEXT: mov v2.h[5], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s22, s22 +; NONEON-NOSVE-NEXT: fcvt h3, s22 +; NONEON-NOSVE-NEXT: mov v18.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s7, s7 +; NONEON-NOSVE-NEXT: fcvt h0, s7 +; NONEON-NOSVE-NEXT: mov v2.h[6], v0.h[0] +; NONEON-NOSVE-NEXT: fsqrt s23, s23 +; NONEON-NOSVE-NEXT: fcvt h3, s23 +; NONEON-NOSVE-NEXT: mov v18.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s16, s16 +; NONEON-NOSVE-NEXT: fcvt h3, s16 +; NONEON-NOSVE-NEXT: mov v18.h[7], v3.h[0] +; NONEON-NOSVE-NEXT: fsqrt s1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q18, q2, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -744,6 +1564,11 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) { ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsqrt v0.2s, v0.2s +; 
NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -756,6 +1581,11 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) { ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsqrt v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -769,6 +1599,14 @@ define void @fsqrt_v8f32(ptr %a) { ; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fsqrt v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fsqrt v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -783,6 +1621,11 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) { ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsqrt v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -796,6 +1639,14 @@ define void @fsqrt_v4f64(ptr %a) { ; CHECK-NEXT: fsqrt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsqrt_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fsqrt v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fsqrt v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -815,6 +1666,14 @@ define <2 x half> @fsub_v2f16(<2 x half> 
%op1, <2 x half> %op2) { ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fsub <2 x half> %op1, %op2 ret <2 x half> %res } @@ -828,6 +1687,14 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fsub <4 x half> %op1, %op2 ret <4 x half> %res } @@ -841,6 +1708,18 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fsub v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fsub v1.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fsub <8 x half> %op1, %op2 ret <8 x half> %res } @@ -856,6 +1735,29 @@ define void @fsub_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fsub z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, 
v0.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fsub v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fsub v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fsub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fsub v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fsub <16 x half> %op1, %op2 @@ -872,6 +1774,11 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = fsub <2 x float> %op1, %op2 ret <2 x float> %res } @@ -885,6 +1792,11 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = fsub <4 x float> %op1, %op2 ret <4 x float> %res } @@ -900,6 +1812,15 @@ define void @fsub_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fsub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fsub v1.4s, 
v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fsub <8 x float> %op1, %op2 @@ -916,6 +1837,11 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fsub v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = fsub <2 x double> %op1, %op2 ret <2 x double> %res } @@ -931,6 +1857,15 @@ define void @fsub_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fsub z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fsub_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fsub v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fsub v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = fsub <4 x double> %op1, %op2 @@ -950,6 +1885,11 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) { ; CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: bic v0.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.fabs.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -962,6 +1902,11 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) { ; CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: bic v0.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -974,6 +1919,11 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) { ; 
CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: bic v0.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -987,6 +1937,14 @@ define void @fabs_v16f16(ptr %a) { ; CHECK-NEXT: fabs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: bic v0.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: bic v1.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -1001,6 +1959,11 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) { ; CHECK-NEXT: fabs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fabs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -1013,6 +1976,11 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) { ; CHECK-NEXT: fabs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fabs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -1026,6 +1994,14 @@ define void @fabs_v8f32(ptr %a) { ; CHECK-NEXT: fabs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fabs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fabs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -1040,6 +2016,11 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) { ; CHECK-NEXT: fabs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fabs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -1053,6 +2034,14 @@ define void @fabs_v4f64(ptr %a) { ; CHECK-NEXT: fabs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fabs_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fabs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fabs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll index cbd0ad66fba767..c5ed70c8a5f2f8 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,14 @@ define <2 x i16> @fcmp_oeq_v2f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: 
ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x half> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i16> ret <2 x i16> %sext @@ -34,6 +43,14 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <4 x half> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i16> ret <4 x i16> %sext @@ -49,6 +66,65 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: mov h4, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[4] +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s2, s5 +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; 
NONEON-NOSVE-NEXT: mov h5, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[5] +; NONEON-NOSVE-NEXT: mov h4, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[6] +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <8 x half> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i16> ret <8 x i16> %sext @@ -66,6 +142,123 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt 
s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, eq +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, eq +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, eq +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, eq +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] 
+; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, eq +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp oeq <16 x half> %op1, %op2 @@ -84,6 +277,11 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; 
CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcmeq v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x float> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i32> ret <2 x i32> %sext @@ -99,6 +297,11 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <4 x float> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %sext @@ -116,6 +319,15 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fcmeq v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %cmp = fcmp oeq <8 x float> %op1, %op2 @@ -132,6 +344,11 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcmeq d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <1 x double> %op1, %op2 %sext = sext <1 x i1> %cmp to <1 x i64> ret <1 x i64> %sext @@ -147,6 +364,11 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fcmp_oeq_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcmeq v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x double> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i64> ret <2 x i64> %sext @@ -164,6 +386,15 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oeq_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fcmeq v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %cmp = fcmp oeq <4 x double> %op1, %op2 @@ -192,6 +423,139 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ueq_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h2 +; NONEON-NOSVE-NEXT: mov h5, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: mov h7, v1.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[4] +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: csinv w12, w9, wzr, vc +; 
NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: mov h5, v2.h[5] +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: csinv w10, w9, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: csinv w11, w9, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s6, h16 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: csinv w9, w9, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: mov h7, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w13, eq +; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s6, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: mov h6, v0.h[2] +; NONEON-NOSVE-NEXT: mov h7, v1.h[2] +; NONEON-NOSVE-NEXT: csetm w14, eq +; NONEON-NOSVE-NEXT: csinv w14, w14, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s4, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w15, eq +; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s5, s3 +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w16, eq +; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s4, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h3 +; NONEON-NOSVE-NEXT: fmov s2, w12 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w17, eq +; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, vc +; 
NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[4] +; NONEON-NOSVE-NEXT: fmov s3, w17 +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v3.h[1], w16 +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v0.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov v2.h[2], w10 +; NONEON-NOSVE-NEXT: mov v3.h[2], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: mov h7, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w11 +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov v3.h[3], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: mov v2.h[4], w9 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v3.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov v2.h[5], w13 +; NONEON-NOSVE-NEXT: mov v3.h[5], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], w14 +; NONEON-NOSVE-NEXT: mov v3.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: mov v2.h[7], w15 +; NONEON-NOSVE-NEXT: mov v3.h[7], w8 +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ueq <16 x half> %op1, %op2 @@ 
-220,6 +584,139 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_one_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h2 +; NONEON-NOSVE-NEXT: mov h5, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: mov h7, v1.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[4] +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: csinv w12, w9, wzr, le +; NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: mov h5, v2.h[5] +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: csinv w10, w9, wzr, le +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: csinv w11, w9, wzr, le +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s6, h16 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: csinv w9, 
w9, wzr, le +; NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: mov h7, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w13, mi +; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, le +; NONEON-NOSVE-NEXT: fcmp s6, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: mov h6, v0.h[2] +; NONEON-NOSVE-NEXT: mov h7, v1.h[2] +; NONEON-NOSVE-NEXT: csetm w14, mi +; NONEON-NOSVE-NEXT: csinv w14, w14, wzr, le +; NONEON-NOSVE-NEXT: fcmp s4, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w15, mi +; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, le +; NONEON-NOSVE-NEXT: fcmp s5, s3 +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w16, mi +; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, le +; NONEON-NOSVE-NEXT: fcmp s4, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h3 +; NONEON-NOSVE-NEXT: fmov s2, w12 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w17, mi +; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[4] +; NONEON-NOSVE-NEXT: fmov s3, w17 +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: mov v3.h[1], w16 +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v0.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov v2.h[2], w10 +; NONEON-NOSVE-NEXT: mov v3.h[2], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: mov h7, v1.h[6] +; NONEON-NOSVE-NEXT: mov 
v2.h[3], w11 +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov v3.h[3], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: mov v2.h[4], w9 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v3.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov v2.h[5], w13 +; NONEON-NOSVE-NEXT: mov v3.h[5], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], w14 +; NONEON-NOSVE-NEXT: mov v3.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: mov v2.h[7], w15 +; NONEON-NOSVE-NEXT: mov v3.h[7], w8 +; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp one <16 x half> %op1, %op2 @@ -244,6 +741,123 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_une_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; 
NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ne +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ne +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ne +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ne +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ne +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ne +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; 
NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp une <16 x half> %op1, %op2 @@ -268,6 +882,123 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ogt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; 
NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, gt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, gt +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, gt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, gt +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, gt +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, gt +; 
NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, gt +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, gt +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x 
half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ogt <16 x half> %op1, %op2 @@ -295,6 +1026,123 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ugt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, hi +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, hi +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, hi +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, 
h6 +; NONEON-NOSVE-NEXT: csetm w13, hi +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, hi +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, hi +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, hi +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, hi +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ugt <16 x half> %op1, %op2 @@ -319,6 +1167,123 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_olt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, mi +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, mi +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr 
q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, mi +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, mi +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, mi +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, mi +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, mi +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, mi +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, mi +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: 
fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp olt <16 x half> %op1, %op2 @@ -346,6 +1311,123 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ult_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: 
fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, lt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, lt +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, lt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, lt +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, lt +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, lt +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, lt +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, lt +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov 
v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ult <16 x half> %op1, %op2 @@ -370,6 +1452,123 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_oge_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] 
+; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ge +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ge +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ge +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ge +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ge +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ge +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ge +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ge +; 
NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ge +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp oge <16 x half> %op1, %op2 @@ -397,6 +1596,123 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, 
[x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_uge_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, pl +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, pl +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, pl +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, pl +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, pl +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; 
NONEON-NOSVE-NEXT: csetm w14, pl +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, pl +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, pl +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, pl +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, pl +; 
NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp uge <16 x half> %op1, %op2 @@ -421,6 +1737,123 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ole_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ls +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ls +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ls +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, 
h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ls +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ls +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ls +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ls +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ls +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ls +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; 
NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ole <16 x half> %op1, %op2 @@ -448,6 +1881,123 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ule_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; 
NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, le +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, le +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, le +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, le +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, le +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, le +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; 
NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ule <16 x half> %op1, %op2 @@ -472,6 +2022,123 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_uno_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov 
h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, vs +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, vs +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, vs +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, vs +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, vs +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, vs +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, vs +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, vs +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, vs +; NONEON-NOSVE-NEXT: mov 
v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp uno <16 x half> %op1, %op2 @@ -499,6 +2166,123 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: eor z0.d, z2.d, z0.d ; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ord_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov 
h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, vc +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, vc +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, vc +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, vc +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, vc +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, 
vc +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, vc +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, vc +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 
x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp ord <16 x half> %op1, %op2 @@ -523,6 +2307,123 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_eq_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, eq +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; 
NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, eq +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, eq +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, eq +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, eq +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, eq +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: 
fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast oeq <16 x half> %op1, %op2 @@ -547,6 +2448,123 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ne_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ne +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; 
NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ne +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ne +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ne +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ne +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ne +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; 
NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast one <16 x half> %op1, %op2 @@ -571,6 +2589,123 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_gt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; 
NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, gt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, gt +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, gt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, gt +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, gt +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, gt +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, gt +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, gt +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; 
NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast ogt <16 x half> %op1, %op2 @@ -595,6 +2730,123 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_lt_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; 
NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, lt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, lt +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, lt +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, lt +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, lt +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, lt +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; 
NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, lt +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, lt +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast olt <16 x half> %op1, %op2 @@ -619,6 +2871,123 @@ define void @fcmp_ge_v16f16(ptr %a, 
ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ge_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, ge +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, ge +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, ge +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, ge +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, ge +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; 
NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, ge +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, ge +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, ge +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, ge +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: 
mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast oge <16 x half> %op1, %op2 @@ -643,6 +3012,123 @@ define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_le_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: mov h0, v2.h[1] +; NONEON-NOSVE-NEXT: mov h3, v1.h[1] +; NONEON-NOSVE-NEXT: mov h4, v2.h[2] +; NONEON-NOSVE-NEXT: mov h5, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h2 +; NONEON-NOSVE-NEXT: fcvt s7, h1 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h0, v2.h[3] +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[4] +; NONEON-NOSVE-NEXT: mov h7, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w12, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v2.h[5] +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w11, le +; NONEON-NOSVE-NEXT: fcmp s3, s0 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w9, le +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; 
NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: csetm w10, le +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[1] +; NONEON-NOSVE-NEXT: mov h5, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: csetm w13, le +; NONEON-NOSVE-NEXT: fcmp s7, s3 +; NONEON-NOSVE-NEXT: fmov s7, w12 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: csetm w14, le +; NONEON-NOSVE-NEXT: fcmp s6, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: mov v7.h[1], w8 +; NONEON-NOSVE-NEXT: csetm w15, le +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h4, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: mov v7.h[2], w11 +; NONEON-NOSVE-NEXT: csetm w16, le +; NONEON-NOSVE-NEXT: fcmp s5, s2 +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: csetm w17, le +; NONEON-NOSVE-NEXT: mov v7.h[3], w9 +; NONEON-NOSVE-NEXT: fmov s2, w17 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: mov h4, v0.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[1], w16 +; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: mov h5, v1.h[5] +; NONEON-NOSVE-NEXT: mov h6, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: mov h4, 
v0.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s6, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: fcmp s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %cmp = fcmp fast ole <16 x half> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll index 57d072a7bcd68b..055af194be211a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,17 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fp_convert_combine_crash: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov v0.4s, #8.00000000 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmul v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmul v0.4s, 
v2.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %f = load <8 x float>, ptr %a %mul.i = fmul <8 x float> %f, diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll index 6a2dc3c7182527..ce8902cfa16c3d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,12 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f16_to_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %res = fpext <2 x half> %a to <2 x float> store <2 x float> %res, ptr %b ret void @@ -31,6 +38,12 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f16_to_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %res = fpext <4 x half> %a to <4 x float> store <4 x float> %res, ptr %b ret void @@ -48,6 +61,17 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fcvt_v8f16_to_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = fpext <8 x half> %a to <8 x float> store <8 x float> %res, ptr %b ret void @@ -72,6 +96,21 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) { ; CHECK-NEXT: stp q3, q0, [x0] ; CHECK-NEXT: stp q2, q1, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: stp q0, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %res = fpext <16 x half> %a to <16 x float> store <16 x float> %res, ptr %b ret void @@ -90,6 +129,13 @@ define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %res = fpext <2 x half> %op1 to <2 x float> store <2 x float> %res, ptr %b @@ -104,6 +150,13 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvtl 
v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fpext <4 x half> %op1 to <4 x float> store <4 x float> %res, ptr %b @@ -121,6 +174,18 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z1.s, p0/m, z1.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fpext <8 x half> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -145,6 +210,22 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fpext <16 x half> %op1 to <16 x float> store <16 x float> %res, ptr %b @@ -162,6 +243,13 @@ define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt d0, h0 ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v1f16_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <1 x half>, ptr %a %res = fpext <1 x half> %op1 to <1 x double> store <1 x double> %res, ptr %b @@ -176,6 +264,14 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.d, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %res = fpext <2 x half> %op1 to <2 x double> store <2 x double> %res, ptr %b @@ -193,6 +289,19 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z1.d, p0/m, z1.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fpext <4 x half> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -217,6 +326,26 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: stp q0, q2, [x1] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fpext <8 x half> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -258,6 +387,38 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q4, q0, [x1, #32] ; CHECK-NEXT: stp q1, q2, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtl v5.2d, v5.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: fcvtl v4.2d, v4.2s +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v7.2s +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v6.2s +; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fpext <16 x half> %op1 to <16 x double> store <16 x double> %res, ptr %b @@ -275,6 +436,13 @@ define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt d0, s0 ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v1f32_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <1 x float>, ptr %a %res = fpext <1 x float> %op1 to <1 x double> store <1 x double> %res, ptr %b @@ -289,6 +457,13 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.d, p0/m, z0.s ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: str q0, 
[x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %res = fpext <2 x float> %op1 to <2 x double> store <2 x double> %res, ptr %b @@ -306,6 +481,18 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z1.d, p0/m, z1.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fpext <4 x float> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -330,6 +517,22 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fpext <8 x float> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -348,6 +551,13 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %res = fptrunc <2 x float> %op1 to <2 x half> store <2 x half> %res, ptr %b @@ -362,6 +572,13 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptrunc <4 x float> %op1 to <4 x half> store <4 x half> %res, ptr %b @@ -379,6 +596,14 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) { ; CHECK-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1] ; CHECK-NEXT: st1h { z1.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptrunc <8 x float> %op1 to <8 x 
half> store <8 x half> %res, ptr %b @@ -397,6 +622,13 @@ define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: st1h { z0.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <1 x double>, ptr %a %res = fptrunc <1 x double> %op1 to <1 x half> store <1 x half> %res, ptr %b @@ -411,6 +643,16 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: st1h { z0.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: fcvt h1, d1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %res = fptrunc <2 x double> %op1 to <2 x half> store <2 x half> %res, ptr %b @@ -428,6 +670,21 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) { ; CHECK-NEXT: st1h { z0.d }, p0, [x1, x8, lsl #1] ; CHECK-NEXT: st1h { z1.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: fcvt h1, d1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: mov d2, v2.d[1] +; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptrunc <4 x double> %op1 to <4 x half> store <4 x half> %res, ptr %b @@ -446,6 +703,13 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) { ; CHECK-NEXT: fcvt z0.s, 
p0/m, z0.d ; CHECK-NEXT: st1w { z0.d }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ret %res = fptrunc <1 x double> %op1 to <1 x float> store <1 x float> %res, ptr %b ret void @@ -459,6 +723,12 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) { ; CHECK-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NEXT: st1w { z0.d }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %res = fptrunc <2 x double> %op1 to <2 x float> store <2 x float> %res, ptr %b ret void @@ -475,6 +745,14 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) { ; CHECK-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2] ; CHECK-NEXT: st1w { z1.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptrunc <4 x double> %op1 to <4 x float> store <4 x float> %res, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll index 153a04f4865715..9d2b55903f3141 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" 
@@ -17,6 +18,18 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <4 x half> %op1, %op2 %res = fadd contract <4 x half> %mul, %op3 ret <4 x half> %res @@ -32,6 +45,26 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fmul v3.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v3.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <8 x half> %op1, %op2 %res = fadd contract <8 x half> %mul, %op3 ret <8 x half> %res @@ -49,6 +82,46 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.h, p0/m, z3.h, z4.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: 
ret +; +; NONEON-NOSVE-LABEL: fma_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fmul v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fmul v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fmul v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: ldp q0, q2, [x2] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %op3 = load <16 x half>, ptr %c @@ -68,6 +141,12 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f32: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: fmla v2.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <2 x float> %op1, %op2 %res = fadd contract <2 x float> %mul, %op3 ret <2 x float> %res @@ -83,6 +162,12 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <4 x float> %op1, %op2 %res = fadd contract <4 x float> %mul, %op3 ret <4 x float> %res @@ -100,6 +185,16 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.s, p0/m, z3.s, z4.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fmla v1.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: fmla v5.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %op3 = load <8 x float>, ptr %c @@ -114,6 +209,11 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double ; CHECK: // %bb.0: ; CHECK-NEXT: fmadd d0, d0, d1, d2 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmadd d0, d0, d1, d2 +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <1 x double> %op1, %op2 %res = fadd contract <1 x double> %mul, %op3 ret <1 x double> %res @@ -129,6 +229,12 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v2f64: +; NONEON-NOSVE: 
// %bb.0: +; NONEON-NOSVE-NEXT: fmla v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %mul = fmul contract <2 x double> %op1, %op2 %res = fadd contract <2 x double> %mul, %op3 ret <2 x double> %res @@ -146,6 +252,16 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: fmla z1.d, p0/m, z3.d, z4.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fma_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fmla v1.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: fmla v5.2d, v4.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %op3 = load <4 x double>, ptr %c diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll index 6945a6102c0553..a96adfec2ad105 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,38 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; 
NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fmaxnm s5, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: fmaxnm s3, s4, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: fmaxnm s1, s4, s1 +; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res } @@ -30,6 +63,64 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fmaxnm s3, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s4 +; NONEON-NOSVE-NEXT: fmaxnm s4, s7, s6 +; 
NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fmaxnm s5, s5, s16 +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt s3, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fmaxnm s3, s6, s3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fmaxnm s6, s16, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res } @@ -45,6 +136,119 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmaxnm z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov h7, v0.h[1] +; NONEON-NOSVE-NEXT: mov h16, v0.h[2] +; NONEON-NOSVE-NEXT: mov h18, v2.h[1] +; NONEON-NOSVE-NEXT: mov h5, v1.h[1] +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; 
NONEON-NOSVE-NEXT: mov h17, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: fcvt s20, h3 +; NONEON-NOSVE-NEXT: fcvt s21, h2 +; NONEON-NOSVE-NEXT: mov h22, v3.h[2] +; NONEON-NOSVE-NEXT: mov h23, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fmaxnm s4, s19, s4 +; NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: mov h24, v3.h[3] +; NONEON-NOSVE-NEXT: fmaxnm s20, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov h23, v2.h[3] +; NONEON-NOSVE-NEXT: mov h25, v2.h[6] +; NONEON-NOSVE-NEXT: fmaxnm s5, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[3] +; NONEON-NOSVE-NEXT: fmaxnm s6, s16, s6 +; NONEON-NOSVE-NEXT: fmaxnm s16, s18, s17 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s18, h19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: mov h24, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h17, s5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt h5, s20 +; NONEON-NOSVE-NEXT: fmaxnm s20, s22, s21 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s21, h23 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: mov h22, v0.h[4] +; NONEON-NOSVE-NEXT: mov h23, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: mov h17, v1.h[4] +; NONEON-NOSVE-NEXT: fmaxnm s7, s18, s7 +; NONEON-NOSVE-NEXT: mov h18, v3.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s20 +; NONEON-NOSVE-NEXT: fmaxnm s19, s21, s19 +; NONEON-NOSVE-NEXT: fcvt s20, h23 +; NONEON-NOSVE-NEXT: mov h21, v1.h[5] +; NONEON-NOSVE-NEXT: mov h23, v2.h[5] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: fcvt s6, h17 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: fcvt h7, s7 
+; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov h22, v3.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s19 +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s6, s17, s6 +; NONEON-NOSVE-NEXT: mov h17, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fmaxnm s18, s20, s18 +; NONEON-NOSVE-NEXT: mov h20, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt s7, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt s16, h21 +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: fmaxnm s7, s22, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s16, s21, s16 +; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] +; NONEON-NOSVE-NEXT: fmaxnm s6, s19, s17 +; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fmaxnm s17, s23, s20 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s17 +; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = call <16 x 
half> @llvm.maxnum.v16f16(<16 x half> %op1, <16 x half> %op2) @@ -61,6 +265,11 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnm v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res } @@ -74,6 +283,11 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res } @@ -89,6 +303,15 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fmaxnm z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmaxnm v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %op1, <8 x float> %op2) @@ -101,6 +324,11 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmaxnm d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnm d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res } @@ -114,6 +342,11 @@ 
define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res } @@ -129,6 +362,15 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fmaxnm z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxnm_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmaxnm v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %op1, <4 x double> %op2) @@ -149,6 +391,38 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fminnm s5, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; 
NONEON-NOSVE-NEXT: fminnm s3, s4, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: fminnm s1, s4, s1 +; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res } @@ -162,6 +436,64 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fminnm s3, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s4 +; NONEON-NOSVE-NEXT: fminnm s4, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fminnm s5, s5, s16 +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt s3, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] +; NONEON-NOSVE-NEXT: 
mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fminnm s3, s6, s3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fminnm s6, s16, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res } @@ -177,6 +509,119 @@ define void @fminnm_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov h7, v0.h[1] +; NONEON-NOSVE-NEXT: mov h16, v0.h[2] +; NONEON-NOSVE-NEXT: mov h18, v2.h[1] +; NONEON-NOSVE-NEXT: mov h5, v1.h[1] +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h17, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: fcvt s20, h3 +; NONEON-NOSVE-NEXT: fcvt s21, h2 +; NONEON-NOSVE-NEXT: mov h22, v3.h[2] +; NONEON-NOSVE-NEXT: mov h23, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fminnm s4, s19, s4 +; 
NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: mov h24, v3.h[3] +; NONEON-NOSVE-NEXT: fminnm s20, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov h23, v2.h[3] +; NONEON-NOSVE-NEXT: mov h25, v2.h[6] +; NONEON-NOSVE-NEXT: fminnm s5, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[3] +; NONEON-NOSVE-NEXT: fminnm s6, s16, s6 +; NONEON-NOSVE-NEXT: fminnm s16, s18, s17 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s18, h19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: mov h24, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h17, s5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt h5, s20 +; NONEON-NOSVE-NEXT: fminnm s20, s22, s21 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s21, h23 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: mov h22, v0.h[4] +; NONEON-NOSVE-NEXT: mov h23, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: mov h17, v1.h[4] +; NONEON-NOSVE-NEXT: fminnm s7, s18, s7 +; NONEON-NOSVE-NEXT: mov h18, v3.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s20 +; NONEON-NOSVE-NEXT: fminnm s19, s21, s19 +; NONEON-NOSVE-NEXT: fcvt s20, h23 +; NONEON-NOSVE-NEXT: mov h21, v1.h[5] +; NONEON-NOSVE-NEXT: mov h23, v2.h[5] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: fcvt s6, h17 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov h22, v3.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s19 +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s6, s17, s6 +; NONEON-NOSVE-NEXT: mov h17, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fminnm s18, s20, s18 +; NONEON-NOSVE-NEXT: mov h20, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[3], 
v7.h[0] +; NONEON-NOSVE-NEXT: fcvt s7, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt s16, h21 +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: fminnm s7, s22, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s16, s21, s16 +; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] +; NONEON-NOSVE-NEXT: fminnm s6, s19, s17 +; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fminnm s17, s23, s20 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s17 +; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = call <16 x half> @llvm.minnum.v16f16(<16 x half> %op1, <16 x half> %op2) @@ -193,6 +638,11 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnm v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x 
float> %res } @@ -206,6 +656,11 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnm v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res } @@ -221,6 +676,15 @@ define void @fminnm_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fminnm v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fminnm v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = call <8 x float> @llvm.minnum.v8f32(<8 x float> %op1, <8 x float> %op2) @@ -233,6 +697,11 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: fminnm d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnm d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res } @@ -246,6 +715,11 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnm v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res } @@ -261,6 +735,15 @@ define void @fminnm_v4f64(ptr %a, ptr %b) { 
; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminnm_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fminnm v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fminnm v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = call <4 x double> @llvm.minnum.v4f64(<4 x double> %op1, <4 x double> %op2) @@ -281,6 +764,38 @@ define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fmax s5, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: fmax s3, s4, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: fmax s1, s4, s1 +; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x 
half> %op2) ret <4 x half> %res } @@ -294,6 +809,64 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fmax s3, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s4 +; NONEON-NOSVE-NEXT: fmax s4, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fmax s5, s5, s16 +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt s3, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fmax s3, s6, s3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fmax s6, s16, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; 
NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res } @@ -309,6 +882,119 @@ define void @fmax_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmax z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov h7, v0.h[1] +; NONEON-NOSVE-NEXT: mov h16, v0.h[2] +; NONEON-NOSVE-NEXT: mov h18, v2.h[1] +; NONEON-NOSVE-NEXT: mov h5, v1.h[1] +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h17, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: fcvt s20, h3 +; NONEON-NOSVE-NEXT: fcvt s21, h2 +; NONEON-NOSVE-NEXT: mov h22, v3.h[2] +; NONEON-NOSVE-NEXT: mov h23, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fmax s4, s19, s4 +; NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: mov h24, v3.h[3] +; NONEON-NOSVE-NEXT: fmax s20, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov h23, v2.h[3] +; NONEON-NOSVE-NEXT: mov h25, v2.h[6] +; NONEON-NOSVE-NEXT: fmax s5, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[3] +; NONEON-NOSVE-NEXT: fmax s6, s16, s6 +; NONEON-NOSVE-NEXT: fmax s16, s18, s17 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s18, h19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: mov h24, v0.h[5] 
+; NONEON-NOSVE-NEXT: fcvt h17, s5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt h5, s20 +; NONEON-NOSVE-NEXT: fmax s20, s22, s21 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s21, h23 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: mov h22, v0.h[4] +; NONEON-NOSVE-NEXT: mov h23, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: mov h17, v1.h[4] +; NONEON-NOSVE-NEXT: fmax s7, s18, s7 +; NONEON-NOSVE-NEXT: mov h18, v3.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s20 +; NONEON-NOSVE-NEXT: fmax s19, s21, s19 +; NONEON-NOSVE-NEXT: fcvt s20, h23 +; NONEON-NOSVE-NEXT: mov h21, v1.h[5] +; NONEON-NOSVE-NEXT: mov h23, v2.h[5] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: fcvt s6, h17 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov h22, v3.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s19 +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s6, s17, s6 +; NONEON-NOSVE-NEXT: mov h17, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fmax s18, s20, s18 +; NONEON-NOSVE-NEXT: mov h20, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt s7, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt s16, h21 +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: fmax s7, s22, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s16, s21, 
s16 +; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] +; NONEON-NOSVE-NEXT: fmax s6, s19, s17 +; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fmax s17, s23, s20 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s17 +; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = call <16 x half> @llvm.maximum.v16f16(<16 x half> %op1, <16 x half> %op2) @@ -325,6 +1011,11 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res } @@ -338,6 +1029,11 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res } @@ -353,6 +1049,15 @@ define void @fmax_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fmax z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: 
ret +; +; NONEON-NOSVE-LABEL: fmax_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = call <8 x float> @llvm.maximum.v8f32(<8 x float> %op1, <8 x float> %op2) @@ -365,6 +1070,11 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmax d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmax d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res } @@ -378,6 +1088,11 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmax v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res } @@ -393,6 +1108,15 @@ define void @fmax_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fmax z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmax_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmax v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmax v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = call <4 x double> @llvm.maximum.v4f64(<4 x double> %op1, <4 x double> %op2) @@ -413,6 +1137,38 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) { ; 
CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h4 +; NONEON-NOSVE-NEXT: fcvt s4, h5 +; NONEON-NOSVE-NEXT: fmin s5, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v0.h[3] +; NONEON-NOSVE-NEXT: fmin s3, s4, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s5 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: fmin s1, s4, s1 +; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res } @@ -426,6 +1182,64 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; 
NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fmin s3, s3, s2 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s4 +; NONEON-NOSVE-NEXT: fmin s4, s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fmin s5, s5, s16 +; NONEON-NOSVE-NEXT: mov h16, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt s3, h6 +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: mov h7, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fmin s3, s6, s3 +; NONEON-NOSVE-NEXT: mov h6, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h6 +; NONEON-NOSVE-NEXT: fmin s6, s16, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] +; NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h3, s6 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] +; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res } @@ -441,6 +1255,119 @@ define void @fmin_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmin z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v16f16: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov h7, v0.h[1] +; NONEON-NOSVE-NEXT: mov h16, v0.h[2] +; NONEON-NOSVE-NEXT: mov h18, v2.h[1] +; NONEON-NOSVE-NEXT: mov h5, v1.h[1] +; NONEON-NOSVE-NEXT: mov h6, v1.h[2] +; NONEON-NOSVE-NEXT: mov h17, v3.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s19, h0 +; NONEON-NOSVE-NEXT: fcvt s20, h3 +; NONEON-NOSVE-NEXT: fcvt s21, h2 +; NONEON-NOSVE-NEXT: mov h22, v3.h[2] +; NONEON-NOSVE-NEXT: mov h23, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fmin s4, s19, s4 +; NONEON-NOSVE-NEXT: mov h19, v0.h[3] +; NONEON-NOSVE-NEXT: mov h24, v3.h[3] +; NONEON-NOSVE-NEXT: fmin s20, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov h23, v2.h[3] +; NONEON-NOSVE-NEXT: mov h25, v2.h[6] +; NONEON-NOSVE-NEXT: fmin s5, s7, s5 +; NONEON-NOSVE-NEXT: mov h7, v1.h[3] +; NONEON-NOSVE-NEXT: fmin s6, s16, s6 +; NONEON-NOSVE-NEXT: fmin s16, s18, s17 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s18, h19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: mov h24, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h17, s5 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt h5, s20 +; NONEON-NOSVE-NEXT: fmin s20, s22, s21 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt s21, h23 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: mov h22, v0.h[4] +; NONEON-NOSVE-NEXT: mov h23, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] +; NONEON-NOSVE-NEXT: mov h17, v1.h[4] +; NONEON-NOSVE-NEXT: fmin s7, s18, s7 +; NONEON-NOSVE-NEXT: mov h18, v3.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s20 +; NONEON-NOSVE-NEXT: fmin s19, s21, s19 +; 
NONEON-NOSVE-NEXT: fcvt s20, h23 +; NONEON-NOSVE-NEXT: mov h21, v1.h[5] +; NONEON-NOSVE-NEXT: mov h23, v2.h[5] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] +; NONEON-NOSVE-NEXT: fcvt s6, h17 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: mov h22, v3.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h16, s19 +; NONEON-NOSVE-NEXT: mov h19, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s6, s17, s6 +; NONEON-NOSVE-NEXT: mov h17, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fmin s18, s20, s18 +; NONEON-NOSVE-NEXT: mov h20, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt s7, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt s16, h21 +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: mov h3, v3.h[7] +; NONEON-NOSVE-NEXT: fmin s7, s22, s7 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s16, s21, s16 +; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] +; NONEON-NOSVE-NEXT: fmin s6, s19, s17 +; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] +; NONEON-NOSVE-NEXT: fmin s17, s23, s20 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h7, s7 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s17 +; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: mov v5.h[6], 
v3.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] +; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = call <16 x half> @llvm.minimum.v16f16(<16 x half> %op1, <16 x half> %op2) @@ -457,6 +1384,11 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res } @@ -470,6 +1402,11 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) { ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res } @@ -485,6 +1422,15 @@ define void @fmin_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fmin z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = call <8 x float> @llvm.minimum.v8f32(<8 x float> %op1, <8 x float> %op2) @@ -497,6 +1443,11 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmin d0, d0, d1 ; CHECK-NEXT: ret 
+; +; NONEON-NOSVE-LABEL: fmin_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmin d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res } @@ -510,6 +1461,11 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) { ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmin v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res } @@ -525,6 +1481,15 @@ define void @fmin_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fmin z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmin_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmin v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmin v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = call <4 x double> @llvm.minimum.v4f64(<4 x double> %op1, <4 x double> %op2) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll index e239ff5e35fd36..f1561011e21812 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64 +; RUN: llc -force-streaming-compatible < %s | 
FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -26,6 +27,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) { ; NO-FA64-NEXT: fadd h0, h0, h2 ; NO-FA64-NEXT: fadd h0, h0, h1 ; NO-FA64-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll index 78ae7bb6cf30ab..a0a7dad835662e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) { ; CHECK-NEXT: fadd h0, h0, h2 ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fadda_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res } @@ -43,6 +68,49 @@ define half @fadda_v8f16(half %start, <8 x half> %a) { ; CHECK-NEXT: fadd h0, h0, h2 ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 
+; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a) ret half %res } @@ -83,6 +151,90 @@ define half @fadda_v16f16(half %start, ptr %a) { ; CHECK-NEXT: fadd h0, h0, h2 ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, 
h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op) ret half %res @@ -96,6 +248,14 @@ define float @fadda_v2f32(float %start, <2 x float> %a) { ; CHECK-NEXT: mov z1.s, z1.s[1] ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fadda_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a) ret float %res } @@ -112,6 +272,17 @@ define float @fadda_v4f32(float %start, <4 x float> %a) { ; CHECK-NEXT: fadd s0, s0, s2 ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: mov s3, v1.s[2] +; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s3 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a) ret float %res } @@ -136,6 +307,26 @@ define float @fadda_v8f32(float %start, ptr %a) { ; CHECK-NEXT: fadd s0, s0, s2 ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: mov s3, v1.s[2] +; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s3 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: mov s3, v1.s[2] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s3 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op) ret float %res @@ -146,6 +337,11 @@ define double @fadda_v1f64(double 
%start, <1 x double> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a) ret double %res } @@ -158,6 +354,13 @@ define double @fadda_v2f64(double %start, <2 x double> %a) { ; CHECK-NEXT: mov z1.d, z1.d[1] ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov d2, v1.d[1] +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a) ret double %res } @@ -174,6 +377,17 @@ define double @fadda_v4f64(double %start, ptr %a) { ; CHECK-NEXT: mov z1.d, z1.d[1] ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadda_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] +; NONEON-NOSVE-NEXT: mov d2, v3.d[1] +; NONEON-NOSVE-NEXT: fadd d0, d0, d3 +; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: mov d2, v1.d[1] +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op) ret double %res @@ -191,6 +405,30 @@ define half @faddv_v4f16(half %start, <4 x half> %a) { ; CHECK-NEXT: faddv h1, p0, z1.h ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: mov h1, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt 
h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res } @@ -203,6 +441,49 @@ define half @faddv_v8f16(half %start, <8 x half> %a) { ; CHECK-NEXT: faddv h1, p0, z1.h ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[6] +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; 
NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a) ret half %res } @@ -216,6 +497,58 @@ define half @faddv_v16f16(half %start, ptr %a) { ; CHECK-NEXT: faddv h1, p0, z1.h ; CHECK-NEXT: fadd h0, h0, h1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fadd v3.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s +; NONEON-NOSVE-NEXT: mov h1, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s3, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s3, s1 +; NONEON-NOSVE-NEXT: mov h3, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: mov h3, v2.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: mov h3, v2.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: mov h3, v2.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: mov h3, v2.h[6] +; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s3 
+; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op) ret half %res @@ -229,6 +562,12 @@ define float @faddv_v2f32(float %start, <2 x float> %a) { ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a) ret float %res } @@ -241,6 +580,13 @@ define float @faddv_v4f32(float %start, <4 x float> %a) { ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: faddp v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a) ret float %res } @@ -254,6 +600,15 @@ define float @faddv_v8f32(float %start, ptr %a) { ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: faddp v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op) ret float %res @@ -264,6 +619,11 @@ define double @faddv_v1f64(double %start, <1 x double> %a) { ; CHECK: // %bb.0: ; 
CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a) ret double %res } @@ -276,6 +636,12 @@ define double @faddv_v2f64(double %start, <2 x double> %a) { ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: faddp d1, v1.2d +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a) ret double %res } @@ -289,6 +655,14 @@ define double @faddv_v4f64(double %start, ptr %a) { ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: faddv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v1.2d +; NONEON-NOSVE-NEXT: faddp d1, v1.2d +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op) ret double %res @@ -306,6 +680,26 @@ define half @fmaxv_v4f16(<4 x half> %a) { ; CHECK-NEXT: fmaxnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; 
NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a) ret half %res } @@ -318,6 +712,45 @@ define half @fmaxv_v8f16(<8 x half> %a) { ; CHECK-NEXT: fmaxnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a) ret half %res } @@ -331,6 +764,85 @@ define half @fmaxv_v16f16(ptr %a) { ; CHECK-NEXT: fmaxnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fmaxv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fmaxnm s2, s4, s2 +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; 
NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op) ret half %res @@ -344,6 +856,11 @@ define float @fmaxv_v2f32(<2 x float> %a) { ; CHECK-NEXT: fmaxnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnmp s0, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a) ret float %res } @@ -356,6 +873,11 @@ define float @fmaxv_v4f32(<4 x float> %a) { ; CHECK-NEXT: fmaxnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnmv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %res } @@ -369,6 +891,13 @@ define float @fmaxv_v8f32(ptr %a) { ; CHECK-NEXT: fmaxnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmaxnmv 
s0, v0.4s +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op) ret float %res @@ -378,6 +907,10 @@ define double @fmaxv_v1f64(<1 x double> %a) { ; CHECK-LABEL: fmaxv_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) ret double %res } @@ -390,6 +923,11 @@ define double @fmaxv_v2f64(<2 x double> %a) { ; CHECK-NEXT: fmaxnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxnmp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %res } @@ -403,6 +941,13 @@ define double @fmaxv_v4f64(ptr %a) { ; CHECK-NEXT: fmaxnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaxv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmaxnmp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op) ret double %res @@ -420,6 +965,26 @@ define half @fminv_v4f16(<4 x half> %a) { ; CHECK-NEXT: fminnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 
+; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %res } @@ -432,6 +997,45 @@ define half @fminv_v8f16(<8 x half> %a) { ; CHECK-NEXT: fminnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a) ret half %res } @@ -445,6 +1049,85 @@ define half @fminv_v16f16(ptr %a) { ; CHECK-NEXT: fminnmv h0, p0, z0.h ; CHECK-NEXT: // 
kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fminnm s2, s4, s2 +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fminnm s2, s2, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, 
s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fminnm s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op) ret half %res @@ -458,6 +1141,11 @@ define float @fminv_v2f32(<2 x float> %a) { ; CHECK-NEXT: fminnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnmp s0, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) ret float %res } @@ -470,6 +1158,11 @@ define float @fminv_v4f32(<4 x float> %a) { ; CHECK-NEXT: fminnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnmv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ret float %res } @@ -483,6 +1176,13 @@ define float @fminv_v8f32(ptr %a) { ; CHECK-NEXT: fminnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, 
[x0] +; NONEON-NOSVE-NEXT: fminnm v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fminnmv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op) ret float %res @@ -492,6 +1192,10 @@ define double @fminv_v1f64(<1 x double> %a) { ; CHECK-LABEL: fminv_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a) ret double %res } @@ -504,6 +1208,11 @@ define double @fminv_v2f64(<2 x double> %a) { ; CHECK-NEXT: fminnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminnmp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %res } @@ -517,6 +1226,13 @@ define double @fminv_v4f64(ptr %a) { ; CHECK-NEXT: fminnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fminnm v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fminnmp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op) ret double %res @@ -534,6 +1250,26 @@ define half @fmaximumv_v4f16(<4 x half> %a) { ; CHECK-NEXT: fmaxv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; 
NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a) ret half %res } @@ -546,6 +1282,45 @@ define half @fmaximumv_v8f16(<8 x half> %a) { ; CHECK-NEXT: fmaxv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a) ret half %res } @@ -559,6 
+1334,85 @@ define half @fmaximumv_v16f16(ptr %a) { ; CHECK-NEXT: fmaxv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fmax s2, s4, s2 +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fmax s2, s2, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[6] +; NONEON-NOSVE-NEXT: 
mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fmax s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op) ret half %res @@ -572,6 +1426,11 @@ define float @fmaximumv_v2f32(<2 x float> %a) { ; CHECK-NEXT: fmaxv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxp s0, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a) ret float %res } @@ -584,6 +1443,11 @@ define float @fmaximumv_v4f32(<4 x float> %a) { ; CHECK-NEXT: fmaxv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a) ret float %res } @@ -597,6 +1461,13 @@ define float @fmaximumv_v8f32(ptr %a) { ; CHECK-NEXT: fmaxv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fmaximumv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fmaxv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op) ret float %res @@ -606,6 +1477,10 @@ define double @fmaximumv_v1f64(<1 x double> %a) { ; CHECK-LABEL: fmaximumv_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a) ret double %res } @@ -618,6 +1493,11 @@ define double @fmaximumv_v2f64(<2 x double> %a) { ; CHECK-NEXT: fmaxv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmaxp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a) ret double %res } @@ -631,6 +1511,13 @@ define double @fmaximumv_v4f64(ptr %a) { ; CHECK-NEXT: fmaxv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fmaximumv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmax v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fmaxp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op) ret double %res @@ -648,6 +1535,26 @@ define half @fminimumv_v4f16(<4 x half> %a) { ; CHECK-NEXT: fminv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, 
s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a) ret half %res } @@ -660,6 +1567,45 @@ define half @fminimumv_v8f16(<8 x half> %a) { ; CHECK-NEXT: fminv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s2, s1 +; NONEON-NOSVE-NEXT: mov h2, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: mov h2, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; 
NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a) ret half %res } @@ -673,6 +1619,85 @@ define half @fminimumv_v16f16(ptr %a) { ; CHECK-NEXT: fminv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s4, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: mov h3, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[3] +; NONEON-NOSVE-NEXT: fmin s2, s4, s2 +; NONEON-NOSVE-NEXT: mov h4, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[4] +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: mov h3, v1.h[4] +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s3, s5, s3 +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fmin s2, s2, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; 
NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s4, s5, s4 +; NONEON-NOSVE-NEXT: mov h5, v0.h[6] +; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h3, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[6] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fmin s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op) ret half %res @@ -686,6 +1711,11 @@ define float @fminimumv_v2f32(<2 x float> %a) { ; CHECK-NEXT: fminv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminp s0, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a) ret float %res } @@ -698,6 +1728,11 @@ define float @fminimumv_v4f32(<4 x float> %a) { ; CHECK-NEXT: fminv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a) ret float %res } @@ -711,6 +1746,13 @@ define float 
@fminimumv_v8f32(ptr %a) { ; CHECK-NEXT: fminv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fminv s0, v0.4s +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op) ret float %res @@ -720,6 +1762,10 @@ define double @fminimumv_v1f64(<1 x double> %a) { ; CHECK-LABEL: fminimumv_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fminimum.v1f64(<1 x double> %a) ret double %res } @@ -732,6 +1778,11 @@ define double @fminimumv_v2f64(<2 x double> %a) { ; CHECK-NEXT: fminv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fminp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a) ret double %res } @@ -745,6 +1796,13 @@ define double @fminimumv_v4f64(ptr %a) { ; CHECK-NEXT: fminv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fminimumv_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: fmin v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fminp d0, v0.2d +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op) ret double %res diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll index 412c27cb82f1d4..6af2b885ace08f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,13 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.ceil.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -28,6 +36,13 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -40,6 +55,16 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintp v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -53,6 
+78,24 @@ define void @frintp_v16f16(ptr %a) { ; CHECK-NEXT: frintp z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintp v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frintp v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -67,6 +110,11 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintp z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintp v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -79,6 +127,11 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintp z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -92,6 +145,14 @@ define void @frintp_v8f32(ptr %a) { ; CHECK-NEXT: frintp z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintp 
v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -103,6 +164,11 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintp d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -115,6 +181,11 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) { ; CHECK-NEXT: frintp z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintp v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -128,6 +199,14 @@ define void @frintp_v4f64(ptr %a) { ; CHECK-NEXT: frintp z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintp_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintp v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frintp v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -146,6 +225,13 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.floor.v2f16(<2 x half> %op) ret <2 x half> %res } @@ 
-158,6 +244,13 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -170,6 +263,16 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintm v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -183,6 +286,24 @@ define void @frintm_v16f16(ptr %a) { ; CHECK-NEXT: frintm z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintm v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frintm v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op) store <16 x 
half> %res, ptr %a @@ -197,6 +318,11 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintm z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintm v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -209,6 +335,11 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintm z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -222,6 +353,14 @@ define void @frintm_v8f32(ptr %a) { ; CHECK-NEXT: frintm z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -233,6 +372,11 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintm d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -245,6 +389,11 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) { ; CHECK-NEXT: frintm z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintm v0.2d, v0.2d +; 
NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -258,6 +407,14 @@ define void @frintm_v4f64(ptr %a) { ; CHECK-NEXT: frintm z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintm_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintm v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frintm v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -276,6 +433,13 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) { ; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.nearbyint.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -288,6 +452,13 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) { ; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -300,6 +471,16 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) { ; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, 
v1.4s +; NONEON-NOSVE-NEXT: frinti v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -313,6 +494,24 @@ define void @frinti_v16f16(ptr %a) { ; CHECK-NEXT: frinti z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frinti v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frinti v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -327,6 +526,11 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) { ; CHECK-NEXT: frinti z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinti v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -339,6 +543,11 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) { ; CHECK-NEXT: frinti z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -352,6 +561,14 @@ define void @frinti_v8f32(ptr 
%a) { ; CHECK-NEXT: frinti z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -363,6 +580,11 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frinti d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -375,6 +597,11 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) { ; CHECK-NEXT: frinti z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinti v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -388,6 +615,14 @@ define void @frinti_v4f64(ptr %a) { ; CHECK-NEXT: frinti z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinti_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frinti v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frinti v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -406,6 +641,13 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v2f16: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.rint.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -418,6 +660,13 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -430,6 +679,16 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintx v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -443,6 +702,24 @@ define void @frintx_v16f16(ptr %a) { ; CHECK-NEXT: frintx z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintx v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frintx v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; 
NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -457,6 +734,11 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintx v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -469,6 +751,11 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -482,6 +769,14 @@ define void @frintx_v8f32(ptr %a) { ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -493,6 +788,11 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintx d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -505,6 +805,11 @@ define <2 x double> @frintx_v2f64(<2 x 
double> %op) { ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintx v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -518,6 +823,14 @@ define void @frintx_v4f64(ptr %a) { ; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintx_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintx v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frintx v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -536,6 +849,13 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) { ; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.round.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -548,6 +868,13 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) { ; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -560,6 +887,16 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) { ; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret 
+; +; NONEON-NOSVE-LABEL: frinta_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frinta v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -573,6 +910,24 @@ define void @frinta_v16f16(ptr %a) { ; CHECK-NEXT: frinta z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frinta v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frinta v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -587,6 +942,11 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) { ; CHECK-NEXT: frinta z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinta v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -599,6 +959,11 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) { ; CHECK-NEXT: frinta z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v4f32: +; NONEON-NOSVE: 
// %bb.0: +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -612,6 +977,14 @@ define void @frinta_v8f32(ptr %a) { ; CHECK-NEXT: frinta z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -623,6 +996,11 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frinta d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -635,6 +1013,11 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) { ; CHECK-NEXT: frinta z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frinta v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -648,6 +1031,14 @@ define void @frinta_v4f64(ptr %a) { ; CHECK-NEXT: frinta z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frinta_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frinta v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frinta v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -666,6 +1057,13 @@ define <2 x 
half> @frintn_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -678,6 +1076,13 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -690,6 +1095,16 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintn v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -703,6 +1118,24 @@ define void @frintn_v16f16(ptr %a) { ; CHECK-NEXT: frintn z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintn v2.4s, v2.4s +; 
NONEON-NOSVE-NEXT: frintn v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -717,6 +1150,11 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintn z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintn v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op) ret <2 x float> %res } @@ -729,6 +1167,11 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintn z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -742,6 +1185,14 @@ define void @frintn_v8f32(ptr %a) { ; CHECK-NEXT: frintn z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -753,6 +1204,11 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintn d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v1f64: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -765,6 +1221,11 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) { ; CHECK-NEXT: frintn z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintn v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -778,6 +1239,14 @@ define void @frintn_v4f64(ptr %a) { ; CHECK-NEXT: frintn z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintn_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintn v0.2d, v0.2d +; NONEON-NOSVE-NEXT: frintn v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a @@ -796,6 +1265,13 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) { ; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.trunc.v2f16(<2 x half> %op) ret <2 x half> %res } @@ -808,6 +1284,13 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) { ; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call 
<4 x half> @llvm.trunc.v4f16(<4 x half> %op) ret <4 x half> %res } @@ -820,6 +1303,16 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) { ; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h +; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s +; NONEON-NOSVE-NEXT: frintz v1.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op) ret <8 x half> %res } @@ -833,6 +1326,24 @@ define void @frintz_v16f16(ptr %a) { ; CHECK-NEXT: frintz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: frintz v2.4s, v2.4s +; NONEON-NOSVE-NEXT: frintz v3.4s, v3.4s +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op) store <16 x half> %res, ptr %a @@ -847,6 +1358,11 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) { ; CHECK-NEXT: frintz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op) ret <2 x 
float> %res } @@ -859,6 +1375,11 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) { ; CHECK-NEXT: frintz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op) ret <4 x float> %res } @@ -872,6 +1393,14 @@ define void @frintz_v8f32(ptr %a) { ; CHECK-NEXT: frintz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op) store <8 x float> %res, ptr %a @@ -883,6 +1412,11 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) { ; CHECK: // %bb.0: ; CHECK-NEXT: frintz d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op) ret <1 x double> %res } @@ -895,6 +1429,11 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) { ; CHECK-NEXT: frintz z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: frintz v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op) ret <2 x double> %res } @@ -908,6 +1447,14 @@ define void @frintz_v4f64(ptr %a) { ; CHECK-NEXT: frintz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: frintz_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: frintz v0.2d, v0.2d +; 
NONEON-NOSVE-NEXT: frintz v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op) store <4 x double> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll index 89697cde848b53..824419b31a5a83 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,14 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2 ret <2 x half> %sel } @@ -32,6 +41,14 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2 ret <4 
x half> %sel } @@ -48,6 +65,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.8h, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2 ret <8 x half> %sel } @@ -67,6 +92,20 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <16 x half>, ptr %a %op2 = load volatile <16 x half>, ptr %b %sel = select i1 %mask, <16 x half> %op1, <16 x half> %op2 @@ -86,6 +125,14 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2 ret <2 x float> %sel } @@ -102,6 +149,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) { ; CHECK-NEXT: sel 
z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4s, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2 ret <4 x float> %sel } @@ -121,6 +176,20 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b %sel = select i1 %mask, <8 x float> %op1, <8 x float> %op2 @@ -134,6 +203,14 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: fcsel d0, d0, d1, ne ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: fmov d2, x8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel } @@ -151,6 +228,14 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f64: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: dup v2.2d, x8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2 ret <2 x double> %sel } @@ -171,6 +256,20 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b %sel = select i1 %mask, <4 x double> %op1, <4 x double> %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index 5840ffb20994ce..c853bdc5af8db0 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -15,6 +16,13 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) { ; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i16> ret <4 x i16> %res } @@ -27,6 +35,21 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i16> store <8 x i16> %res, ptr %b @@ -42,6 +65,27 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -61,6 +105,13 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x half> %op1 to <2 x i32> ret <2 x i32> %res } @@ -74,6 +125,12 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i32> ret <4 x i32> %res } @@ -90,6 +147,20 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -114,6 +185,26 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i32> store <16 x i32> %res, ptr %b @@ -130,6 +221,13 @@ define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) { ; CHECK-NEXT: fcvtzu x8, h0 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x half> %op1 to <1 x i64> ret <1 x i64> %res } @@ -145,6 +243,18 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) { ; CHECK-NEXT: 
.cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x half> %op1 to <2 x i64> ret <2 x i64> %res } @@ -167,6 +277,27 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: mov h1, v0.h[2] +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s1 +; NONEON-NOSVE-NEXT: fcvtzu x10, s2 +; NONEON-NOSVE-NEXT: fcvtzu x11, s3 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fptoui <4 x half> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -204,6 +335,47 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: mov h1, v0.h[2] +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: mov h4, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov h5, v2.h[2] +; NONEON-NOSVE-NEXT: mov h6, v2.h[3] +; NONEON-NOSVE-NEXT: mov h7, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvtzu x13, s2 +; NONEON-NOSVE-NEXT: fcvtzu x8, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h7 +; NONEON-NOSVE-NEXT: fcvtzu x10, s3 +; NONEON-NOSVE-NEXT: fcvtzu x11, s4 +; NONEON-NOSVE-NEXT: fcvtzu x12, s5 +; NONEON-NOSVE-NEXT: fcvtzu x14, s6 +; NONEON-NOSVE-NEXT: fmov d3, x13 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: fcvtzu x8, s1 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d2, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v3.d[1], x8 +; NONEON-NOSVE-NEXT: mov v2.d[1], x14 +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i64> store <8 x i64> %res, ptr %b @@ -264,6 +436,80 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q5, q2, [x1, #96] ; CHECK-NEXT: add sp, sp, #128 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h0 +; NONEON-NOSVE-NEXT: mov h0, v0.h[1] +; NONEON-NOSVE-NEXT: mov h1, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s17, h4 +; NONEON-NOSVE-NEXT: mov h18, v4.h[2] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvtzu x8, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: mov h16, v4.h[3] +; NONEON-NOSVE-NEXT: fcvtzu x9, s6 +; NONEON-NOSVE-NEXT: ldr d6, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: mov h4, v4.h[1] +; NONEON-NOSVE-NEXT: fcvtzu x11, s2 +; NONEON-NOSVE-NEXT: mov h2, v6.h[2] +; NONEON-NOSVE-NEXT: fcvtzu x10, s17 +; NONEON-NOSVE-NEXT: fcvtzu x13, s5 +; NONEON-NOSVE-NEXT: fcvtzu x12, s3 +; NONEON-NOSVE-NEXT: mov h3, v6.h[3] +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov h5, v6.h[1] +; NONEON-NOSVE-NEXT: fcvt s17, h18 +; NONEON-NOSVE-NEXT: fcvtzu x14, s7 +; NONEON-NOSVE-NEXT: fmov d7, x8 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fmov d0, x11 +; NONEON-NOSVE-NEXT: fcvtzu x11, s1 +; NONEON-NOSVE-NEXT: fmov d1, x13 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvtzu x13, s16 +; NONEON-NOSVE-NEXT: fmov d16, x9 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvtzu x15, s17 +; NONEON-NOSVE-NEXT: mov v0.d[1], x12 +; NONEON-NOSVE-NEXT: mov v1.d[1], x14 +; NONEON-NOSVE-NEXT: fcvtzu x9, s2 +; NONEON-NOSVE-NEXT: mov v16.d[1], x8 +; NONEON-NOSVE-NEXT: fcvtzu x8, s6 +; NONEON-NOSVE-NEXT: fcvtzu x14, s4 +; NONEON-NOSVE-NEXT: fcvtzu x12, s3 +; NONEON-NOSVE-NEXT: mov v7.d[1], x11 +; 
NONEON-NOSVE-NEXT: fmov d3, x10 +; NONEON-NOSVE-NEXT: fcvtzu x11, s5 +; NONEON-NOSVE-NEXT: fmov d2, x15 +; NONEON-NOSVE-NEXT: stp q16, q1, [x1, #64] +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d4, x8 +; NONEON-NOSVE-NEXT: stp q7, q0, [x1] +; NONEON-NOSVE-NEXT: mov v2.d[1], x13 +; NONEON-NOSVE-NEXT: mov v3.d[1], x14 +; NONEON-NOSVE-NEXT: mov v1.d[1], x12 +; NONEON-NOSVE-NEXT: mov v4.d[1], x11 +; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #96] +; NONEON-NOSVE-NEXT: stp q4, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i64> store <16 x i64> %res, ptr %b @@ -282,6 +528,11 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i16> ret <2 x i16> %res } @@ -295,6 +546,12 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x float> %op1 to <4 x i16> ret <4 x i16> %res } @@ -312,6 +569,14 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x 
float> %op1 to <8 x i16> ret <8 x i16> %res @@ -336,6 +601,19 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v16f32_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptoui <16 x float> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -354,6 +632,11 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i32> ret <2 x i32> %res } @@ -366,6 +649,11 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) { ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x float> %op1 to <4 x i32> ret <4 x i32> %res } @@ -379,6 +667,14 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -398,6 +694,13 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f32_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x float> %op1 to <1 x i64> ret <1 x i64> %res } @@ -411,6 +714,12 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i64> ret <2 x i64> %res } @@ -427,6 +736,20 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptoui <4 x float> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -451,6 +774,26 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzu v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzu v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i64> store <8 x i64> %res, ptr %b @@ -468,6 +811,12 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) { ; CHECK-NEXT: mov z0.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i16> ret <1 x i16> %res } @@ -481,6 +830,12 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, 
z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i16> ret <2 x i16> %res } @@ -509,6 +864,15 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i16> ret <4 x i16> %res @@ -552,6 +916,23 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI26_0 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: xtn v7.2s, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI26_0] +; NONEON-NOSVE-NEXT: xtn v6.2s, v1.2d +; NONEON-NOSVE-NEXT: xtn v5.2s, v2.2d +; NONEON-NOSVE-NEXT: xtn v4.2s, v3.2d +; NONEON-NOSVE-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptoui <8 x double> %op1 to <8 x i16> ret <8 x i16> %res @@ -628,6 +1009,35 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fcvtzu_v16f64_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI27_0 +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v5.2d, v5.2d +; NONEON-NOSVE-NEXT: fcvtzs v4.2d, v4.2d +; NONEON-NOSVE-NEXT: fcvtzs v6.2d, v6.2d +; NONEON-NOSVE-NEXT: fcvtzs v7.2d, v7.2d +; NONEON-NOSVE-NEXT: xtn v19.2s, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI27_0] +; NONEON-NOSVE-NEXT: xtn v23.2s, v3.2d +; NONEON-NOSVE-NEXT: xtn v18.2s, v1.2d +; NONEON-NOSVE-NEXT: xtn v22.2s, v2.2d +; NONEON-NOSVE-NEXT: xtn v17.2s, v5.2d +; NONEON-NOSVE-NEXT: xtn v21.2s, v6.2d +; NONEON-NOSVE-NEXT: xtn v16.2s, v4.2d +; NONEON-NOSVE-NEXT: xtn v20.2s, v7.2d +; NONEON-NOSVE-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b +; NONEON-NOSVE-NEXT: tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptoui <16 x double> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -647,6 +1057,13 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i32> ret <1 x i32> %res } @@ -660,6 +1077,12 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i32> ret <2 x i32> %res } @@ -677,6 +1100,14 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i32> ret <4 x i32> %res @@ -701,6 +1132,19 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzu v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtzu v2.2d, v2.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptoui <8 x double> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -719,6 +1163,12 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu x8, d0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i64> ret <1 x i64> %res } @@ -731,6 +1181,11 @@ define <2 x 
i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) { ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i64> ret <2 x i64> %res } @@ -744,6 +1199,14 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -762,6 +1225,13 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) { ; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i16> ret <4 x i16> %res } @@ -774,6 +1244,21 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i16> store <8 x i16> %res, ptr %b @@ -789,6 +1274,27 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -808,6 +1314,13 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; 
NONEON-NOSVE-NEXT: ret %res = fptosi <2 x half> %op1 to <2 x i32> ret <2 x i32> %res } @@ -821,6 +1334,12 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i32> ret <4 x i32> %res } @@ -837,6 +1356,20 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -861,6 +1394,26 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s +; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i32> store <16 x i32> %res, ptr %b @@ -877,6 +1430,13 @@ define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) { ; CHECK-NEXT: fcvtzs x8, h0 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x half> %op1 to <1 x i64> ret <1 x i64> %res } @@ -893,6 +1453,18 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) { ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x half> %op1 to <2 x i64> ret <2 x i64> %res } @@ -915,6 +1487,27 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: mov h1, v0.h[2] +; NONEON-NOSVE-NEXT: mov h2, v0.h[3] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s1 +; NONEON-NOSVE-NEXT: fcvtzs x10, s2 +; NONEON-NOSVE-NEXT: fcvtzs x11, s3 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fptosi <4 x half> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -952,6 +1545,47 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: mov h1, v0.h[2] +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: mov h4, v0.h[1] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: mov h5, v2.h[2] +; NONEON-NOSVE-NEXT: mov h6, v2.h[3] +; NONEON-NOSVE-NEXT: mov h7, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvtzs x13, s2 +; NONEON-NOSVE-NEXT: fcvtzs x8, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h7 +; NONEON-NOSVE-NEXT: fcvtzs x10, s3 +; NONEON-NOSVE-NEXT: fcvtzs x11, s4 +; NONEON-NOSVE-NEXT: fcvtzs x12, s5 +; NONEON-NOSVE-NEXT: fcvtzs x14, s6 +; NONEON-NOSVE-NEXT: fmov d3, x13 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: fcvtzs x8, s1 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d2, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v3.d[1], x8 +; NONEON-NOSVE-NEXT: mov v2.d[1], x14 +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i64> store <8 x i64> %res, ptr %b @@ -1012,6 +1646,80 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q5, q2, [x1, #96] ; CHECK-NEXT: add sp, sp, #128 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] +; NONEON-NOSVE-NEXT: mov h5, v1.h[3] +; NONEON-NOSVE-NEXT: mov h7, v0.h[2] +; NONEON-NOSVE-NEXT: mov h16, v0.h[3] +; NONEON-NOSVE-NEXT: fcvt s6, h0 +; NONEON-NOSVE-NEXT: mov h0, v0.h[1] +; NONEON-NOSVE-NEXT: mov h1, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s17, h4 +; NONEON-NOSVE-NEXT: mov h18, v4.h[2] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvtzs x8, s3 +; NONEON-NOSVE-NEXT: fcvt s3, h5 +; NONEON-NOSVE-NEXT: fcvt s5, h7 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: mov h16, v4.h[3] +; NONEON-NOSVE-NEXT: fcvtzs x9, s6 +; NONEON-NOSVE-NEXT: ldr d6, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: mov h4, v4.h[1] +; NONEON-NOSVE-NEXT: fcvtzs x11, s2 +; NONEON-NOSVE-NEXT: mov h2, v6.h[2] +; NONEON-NOSVE-NEXT: fcvtzs x10, s17 +; NONEON-NOSVE-NEXT: fcvtzs x13, s5 +; NONEON-NOSVE-NEXT: fcvtzs x12, s3 +; NONEON-NOSVE-NEXT: mov h3, v6.h[3] +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov h5, v6.h[1] +; NONEON-NOSVE-NEXT: fcvt s17, h18 +; NONEON-NOSVE-NEXT: fcvtzs x14, s7 +; NONEON-NOSVE-NEXT: fmov d7, x8 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fmov d0, x11 +; NONEON-NOSVE-NEXT: fcvtzs x11, s1 +; NONEON-NOSVE-NEXT: fmov d1, x13 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvtzs x13, s16 +; NONEON-NOSVE-NEXT: fmov d16, x9 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvtzs x15, s17 +; NONEON-NOSVE-NEXT: mov v0.d[1], x12 +; NONEON-NOSVE-NEXT: mov v1.d[1], x14 +; NONEON-NOSVE-NEXT: fcvtzs x9, s2 +; NONEON-NOSVE-NEXT: mov v16.d[1], x8 +; NONEON-NOSVE-NEXT: fcvtzs x8, s6 +; NONEON-NOSVE-NEXT: fcvtzs x14, s4 +; NONEON-NOSVE-NEXT: fcvtzs x12, s3 +; NONEON-NOSVE-NEXT: mov v7.d[1], x11 +; 
NONEON-NOSVE-NEXT: fmov d3, x10 +; NONEON-NOSVE-NEXT: fcvtzs x11, s5 +; NONEON-NOSVE-NEXT: fmov d2, x15 +; NONEON-NOSVE-NEXT: stp q16, q1, [x1, #64] +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: fmov d4, x8 +; NONEON-NOSVE-NEXT: stp q7, q0, [x1] +; NONEON-NOSVE-NEXT: mov v2.d[1], x13 +; NONEON-NOSVE-NEXT: mov v3.d[1], x14 +; NONEON-NOSVE-NEXT: mov v1.d[1], x12 +; NONEON-NOSVE-NEXT: mov v4.d[1], x11 +; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #96] +; NONEON-NOSVE-NEXT: stp q4, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i64> store <16 x i64> %res, ptr %b @@ -1030,6 +1738,11 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i16> ret <2 x i16> %res } @@ -1043,6 +1756,12 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x float> %op1 to <4 x i16> ret <4 x i16> %res } @@ -1060,6 +1779,14 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 
x float> %op1 to <8 x i16> ret <8 x i16> %res @@ -1084,6 +1811,19 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v16f32_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptosi <16 x float> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -1102,6 +1842,11 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i32> ret <2 x i32> %res } @@ -1114,6 +1859,11 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x float> %op1 to <4 x i32> ret <4 x i32> %res } @@ -1127,6 +1877,14 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -1146,6 +1904,13 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f32_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x float> %op1 to <1 x i64> ret <1 x i64> %res } @@ -1159,6 +1924,12 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i64> ret <2 x i64> %res } @@ -1175,6 +1946,20 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptosi <4 x float> %op1 to <4 x i64> store <4 x i64> %res, ptr %b @@ -1199,6 +1984,26 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s +; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s +; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i64> store <8 x i64> %res, ptr %b @@ -1218,6 +2023,12 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) { ; CHECK-NEXT: mov z0.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i16> ret <1 x i16> %res } @@ -1231,6 +2042,12 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) { ; CHECK-NEXT: uzp1 
z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i16> ret <2 x i16> %res } @@ -1259,6 +2076,15 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i16> ret <4 x i16> %res @@ -1302,6 +2128,23 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI61_0 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: xtn v7.2s, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI61_0] +; NONEON-NOSVE-NEXT: xtn v6.2s, v1.2d +; NONEON-NOSVE-NEXT: xtn v5.2s, v2.2d +; NONEON-NOSVE-NEXT: xtn v4.2s, v3.2d +; NONEON-NOSVE-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptosi <8 x double> %op1 to <8 x i16> ret <8 x i16> %res @@ -1378,6 +2221,35 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
fcvtzs_v16f64_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI62_0 +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: fcvtzs v5.2d, v5.2d +; NONEON-NOSVE-NEXT: fcvtzs v4.2d, v4.2d +; NONEON-NOSVE-NEXT: fcvtzs v6.2d, v6.2d +; NONEON-NOSVE-NEXT: fcvtzs v7.2d, v7.2d +; NONEON-NOSVE-NEXT: xtn v19.2s, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI62_0] +; NONEON-NOSVE-NEXT: xtn v23.2s, v3.2d +; NONEON-NOSVE-NEXT: xtn v18.2s, v1.2d +; NONEON-NOSVE-NEXT: xtn v22.2s, v2.2d +; NONEON-NOSVE-NEXT: xtn v17.2s, v5.2d +; NONEON-NOSVE-NEXT: xtn v21.2s, v6.2d +; NONEON-NOSVE-NEXT: xtn v16.2s, v4.2d +; NONEON-NOSVE-NEXT: xtn v20.2s, v7.2d +; NONEON-NOSVE-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b +; NONEON-NOSVE-NEXT: tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptosi <16 x double> %op1 to <16 x i16> store <16 x i16> %res, ptr %b @@ -1397,6 +2269,13 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i32> ret <1 x i32> %res } @@ -1410,6 +2289,12 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i32> ret <2 x i32> %res } @@ -1427,6 +2312,14 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i32> ret <4 x i32> %res @@ -1451,6 +2344,19 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptosi <8 x double> %op1 to <8 x i32> store <8 x i32> %res, ptr %b @@ -1469,6 +2375,12 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i64> ret <1 x i64> %res } @@ -1481,6 +2393,11 @@ define 
<2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) { ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i64> ret <2 x i64> %res } @@ -1494,6 +2411,14 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i64> store <4 x i64> %res, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index c1c7b5c05f5d55..d3b09374676556 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -27,6 +28,14 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uzp1 v2.4h, v2.4h, v0.4h +; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 +; 
NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2 ret <2 x half> %sel } @@ -45,6 +54,13 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2 ret <4 x half> %sel } @@ -64,6 +80,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: shl v2.8h, v2.8h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.8h, v2.8h, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2 ret <8 x half> %sel } @@ -80,6 +104,126 @@ define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: mov h4, v1.h[2] +; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt s6, h1 +; NONEON-NOSVE-NEXT: fcvt s7, h0 +; NONEON-NOSVE-NEXT: mov h16, v1.h[6] +; NONEON-NOSVE-NEXT: mov h17, v0.h[6] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; 
NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: mov h3, v0.h[3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: mov h7, v0.h[4] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: csetm w14, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v1.h[5] +; NONEON-NOSVE-NEXT: mov h5, v0.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w12, eq +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w11, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v1.h[7] +; NONEON-NOSVE-NEXT: mov h7, v0.h[7] +; NONEON-NOSVE-NEXT: mov h18, v3.h[3] +; NONEON-NOSVE-NEXT: csetm w13, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: mov h4, v3.h[1] +; NONEON-NOSVE-NEXT: mov h5, v2.h[1] +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s17, s16 +; NONEON-NOSVE-NEXT: mov h16, v3.h[2] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: mov h17, v2.h[2] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: fcvt s6, h3 +; NONEON-NOSVE-NEXT: fcvt s7, h2 +; NONEON-NOSVE-NEXT: csetm w15, eq +; NONEON-NOSVE-NEXT: fcmp s5, s4 +; NONEON-NOSVE-NEXT: fmov s4, w14 +; NONEON-NOSVE-NEXT: csetm w16, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v2.h[3] +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: fcvt s16, h17 +; NONEON-NOSVE-NEXT: mov v4.h[1], w8 +; NONEON-NOSVE-NEXT: fcvt s17, h18 +; NONEON-NOSVE-NEXT: csetm 
w14, eq +; NONEON-NOSVE-NEXT: fmov s5, w14 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcmp s16, s7 +; NONEON-NOSVE-NEXT: mov h7, v3.h[4] +; NONEON-NOSVE-NEXT: mov h16, v2.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[2], w12 +; NONEON-NOSVE-NEXT: mov v5.h[1], w16 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s6, s17 +; NONEON-NOSVE-NEXT: mov h17, v2.h[5] +; NONEON-NOSVE-NEXT: fcvt s6, h7 +; NONEON-NOSVE-NEXT: fcvt s7, h16 +; NONEON-NOSVE-NEXT: mov h16, v3.h[5] +; NONEON-NOSVE-NEXT: mov v4.h[3], w11 +; NONEON-NOSVE-NEXT: mov v5.h[2], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov h6, v3.h[6] +; NONEON-NOSVE-NEXT: mov h7, v2.h[6] +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: mov v4.h[4], w13 +; NONEON-NOSVE-NEXT: mov v5.h[3], w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcmp s17, s16 +; NONEON-NOSVE-NEXT: mov h16, v3.h[7] +; NONEON-NOSVE-NEXT: mov h17, v2.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[4], w8 +; NONEON-NOSVE-NEXT: mov v4.h[5], w9 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: fcvt s6, h16 +; NONEON-NOSVE-NEXT: fcvt s7, h17 +; NONEON-NOSVE-NEXT: mov v5.h[5], w8 +; NONEON-NOSVE-NEXT: mov v4.h[6], w10 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcmp s7, s6 +; NONEON-NOSVE-NEXT: mov v5.h[6], w8 +; NONEON-NOSVE-NEXT: mov v4.h[7], w15 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: mov v5.h[7], w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %mask = fcmp oeq <16 x half> %op1, %op2 @@ -102,6 +246,13 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m ; 
CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2 ret <2 x float> %sel } @@ -121,6 +272,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: shl v2.4s, v2.4s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.4s, v2.4s, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2 ret <4 x float> %sel } @@ -137,6 +296,18 @@ define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: fcmeq v4.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcmeq v5.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %mask = fcmp oeq <8 x float> %op1, %op2 @@ -151,6 +322,14 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: fcsel d0, d0, d1, ne ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; 
NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: fmov d2, x8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel } @@ -170,6 +349,14 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: shl v2.2d, v2.2d, #63 +; NONEON-NOSVE-NEXT: cmlt v2.2d, v2.2d, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2 ret <2 x double> %sel } @@ -186,6 +373,18 @@ define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: fcmeq v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: fcmeq v5.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %mask = fcmp oeq <4 x double> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index ff38db8c10c04b..ae97a266c6ff0d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: 
llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -21,6 +22,14 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i8> %op1, i8 5, i64 3 ret <4 x i8> %r } @@ -38,6 +47,14 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { ; CHECK-NEXT: mov z0.b, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.b[7], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x i8> %op1, i8 5, i64 7 ret <8 x i8> %r } @@ -55,6 +72,12 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { ; CHECK-NEXT: mov z0.b, p0/m, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.b[15], w8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <16 x i8> %op1, i8 5, i64 15 ret <16 x i8> %r } @@ -72,6 +95,12 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { ; CHECK-NEXT: mov z1.b, p0/m, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; 
NONEON-NOSVE-NEXT: mov v1.b[15], w8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <32 x i8> %op1, i8 5, i64 31 ret <32 x i8> %r } @@ -90,6 +119,14 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i16> %op1, i16 5, i64 1 ret <2 x i16> %r } @@ -107,6 +144,14 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i16> %op1, i16 5, i64 3 ret <4 x i16> %r } @@ -124,6 +169,12 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.h[7], w8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x i16> %op1, i16 5, i64 7 ret <8 x i16> %r } @@ -141,6 +192,12 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { ; CHECK-NEXT: mov z1.h, p0/m, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v1.h[7], w8 +; 
NONEON-NOSVE-NEXT: ret %r = insertelement <16 x i16> %op1, i16 5, i64 15 ret <16 x i16> %r } @@ -159,6 +216,14 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i32> %op1, i32 5, i64 1 ret <2 x i32> %r } @@ -176,6 +241,12 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i32> %op1, i32 5, i64 3 ret <4 x i32> %r } @@ -193,6 +264,13 @@ define <8 x i32> @insertelement_v8i32(ptr %a) { ; CHECK-NEXT: mov z1.s, p0/m, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %r = insertelement <8 x i32> %op1, i32 5, i64 7 ret <8 x i32> %r @@ -205,6 +283,12 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) { ; CHECK-NEXT: mov z0.d, #5 // =0x5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <1 x i64> %op1, i64 5, i64 0 ret <1 x i64> %r } @@ -222,6 +306,12 @@ 
define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i64> %op1, i64 5, i64 1 ret <2 x i64> %r } @@ -239,6 +329,13 @@ define <4 x i64> @insertelement_v4i64(ptr %a) { ; CHECK-NEXT: mov z1.d, p0/m, x8 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 +; NONEON-NOSVE-NEXT: mov v1.d[1], x8 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %r = insertelement <4 x i64> %op1, i64 5, i64 3 ret <4 x i64> %r @@ -257,6 +354,16 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI14_0 +; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI14_0 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: ld1r { v1.4h }, [x8] +; NONEON-NOSVE-NEXT: mov v1.h[0], v0.h[0] +; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x half> %op1, half 5.0, i64 1 ret <2 x half> %r } @@ -274,6 +381,15 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI15_0 +; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI15_0 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed 
$q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x half> %op1, half 5.0, i64 3 ret <4 x half> %r } @@ -291,6 +407,13 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) { ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI16_0 +; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI16_0 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x8] +; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x half> %op1, half 5.0, i64 7 ret <8 x half> %r } @@ -308,6 +431,14 @@ define <16 x half> @insertelement_v16f16(ptr %a) { ; CHECK-NEXT: mov z1.h, p0/m, h2 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: adrp x8, .LCPI17_0 +; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI17_0 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[7], [x8] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %r = insertelement <16 x half> %op1, half 5.0, i64 15 ret <16 x half> %r @@ -327,6 +458,14 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, s1 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov s1, #5.00000000 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: mov v0.s[1], v1.s[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x float> %op1, float 5.0, i64 1 ret <2 x float> %r } @@ -344,6 +483,12 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) { ; CHECK-NEXT: mov z0.s, p0/m, s1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov 
s1, #5.00000000 +; NONEON-NOSVE-NEXT: mov v0.s[3], v1.s[0] +; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x float> %op1, float 5.0, i64 3 ret <4 x float> %r } @@ -361,6 +506,13 @@ define <8 x float> @insertelement_v8f32(ptr %a) { ; CHECK-NEXT: mov z1.s, p0/m, s2 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov s2, #5.00000000 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov v1.s[3], v2.s[0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %r = insertelement <8 x float> %op1, float 5.0, i64 7 ret <8 x float> %r @@ -372,6 +524,12 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, #5.00000000 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %r = insertelement <1 x double> %op1, double 5.0, i64 0 ret <1 x double> %r } @@ -389,6 +547,12 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) { ; CHECK-NEXT: mov z0.d, p0/m, d1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov d1, #5.00000000 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x double> %op1, double 5.0, i64 1 ret <2 x double> %r } @@ -406,6 +570,14 @@ define <4 x double> @insertelement_v4f64(ptr %a) { ; CHECK-NEXT: mov z1.d, p0/m, d2 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: insertelement_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov d0, #5.00000000 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: mov v1.d[1], v0.d[0] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 
= load <4 x double>, ptr %a %r = insertelement <4 x double> %op1, double 5.0, i64 3 ret <4 x double> %r diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll index ee1706bc7c3549..1b438559e05380 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,11 @@ define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = add <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -28,6 +34,11 @@ define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = add <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -40,6 +51,11 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = add <16 x i8> %op1, %op2 
ret <16 x i8> %res } @@ -53,6 +69,15 @@ define void @add_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.b, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = add <32 x i8> %op1, %op2 @@ -68,6 +93,11 @@ define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = add <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -80,6 +110,11 @@ define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = add <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -92,6 +127,11 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = add <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -105,6 +145,15 @@ define void @add_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.h, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; 
NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = add <16 x i16> %op1, %op2 @@ -120,6 +169,11 @@ define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = add <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -132,6 +186,11 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = add <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -145,6 +204,15 @@ define void @add_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.s, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = add <8 x i32> %op1, %op2 @@ -160,6 +228,11 @@ define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = add <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -172,6 +245,11 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; 
CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: add v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = add <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -185,6 +263,15 @@ define void @add_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: add v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = add <4 x i64> %op1, %op2 @@ -213,6 +300,11 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE2-NEXT: mul z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = mul <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -234,6 +326,11 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; SVE2-NEXT: mul z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = mul <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -255,6 +352,11 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; SVE2-NEXT: mul z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = mul <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -279,6 +381,15 @@ define void @mul_v32i8(ptr %a, ptr 
%b) { ; SVE2-NEXT: mul z1.b, z2.b, z3.b ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: mul v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: mul v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = mul <32 x i8> %op1, %op2 @@ -303,6 +414,11 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE2-NEXT: mul z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = mul <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -324,6 +440,11 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; SVE2-NEXT: mul z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = mul <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -345,6 +466,11 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; SVE2-NEXT: mul z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = mul <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -369,6 +495,15 @@ define void @mul_v16i16(ptr %a, ptr %b) { ; SVE2-NEXT: mul z1.h, z2.h, z3.h ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: mul v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: mul v1.8h, v2.8h, v3.8h +; 
NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = mul <16 x i16> %op1, %op2 @@ -393,6 +528,11 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; SVE2-NEXT: mul z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = mul <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -414,6 +554,11 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; SVE2-NEXT: mul z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = mul <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -438,6 +583,15 @@ define void @mul_v8i32(ptr %a, ptr %b) { ; SVE2-NEXT: mul z1.s, z2.s, z3.s ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: mul v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: mul v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = mul <8 x i32> %op1, %op2 @@ -462,6 +616,16 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; SVE2-NEXT: mul z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = mul <1 x i64> %op1, 
%op2 ret <1 x i64> %res } @@ -483,6 +647,18 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; SVE2-NEXT: mul z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x10, d1 +; NONEON-NOSVE-NEXT: fmov x11, d0 +; NONEON-NOSVE-NEXT: mov x8, v1.d[1] +; NONEON-NOSVE-NEXT: mov x9, v0.d[1] +; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: ret %res = mul <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -507,6 +683,29 @@ define void @mul_v4i64(ptr %a, ptr %b) { ; SVE2-NEXT: mul z1.d, z2.d, z3.d ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: fmov x12, d2 +; NONEON-NOSVE-NEXT: mov x11, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x10, v3.d[1] +; NONEON-NOSVE-NEXT: mov x13, v1.d[1] +; NONEON-NOSVE-NEXT: mov x14, v0.d[1] +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov x9, d3 +; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: mul x9, x12, x9 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: mul x11, x14, x13 +; NONEON-NOSVE-NEXT: fmov d0, x9 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = mul <4 x i64> %op1, %op2 @@ -526,6 +725,11 @@ define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; 
NONEON-NOSVE-NEXT: ret %res = sub <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -538,6 +742,11 @@ define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = sub <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -550,6 +759,11 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = sub <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -563,6 +777,15 @@ define void @sub_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sub z1.b, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: sub v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = sub <32 x i8> %op1, %op2 @@ -578,6 +801,11 @@ define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = sub <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -590,6 +818,11 @@ define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i16: +; NONEON-NOSVE: // %bb.0: 
+; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = sub <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -602,6 +835,11 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = sub <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -615,6 +853,15 @@ define void @sub_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: sub z1.h, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = sub <16 x i16> %op1, %op2 @@ -630,6 +877,11 @@ define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = sub <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -642,6 +894,11 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = sub <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -655,6 +912,15 @@ define void @sub_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: sub z1.s, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v8i32: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = sub <8 x i32> %op1, %op2 @@ -670,6 +936,11 @@ define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = sub <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -682,6 +953,11 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = sub <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -695,6 +971,15 @@ define void @sub_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: sub z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: sub v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = sub <4 x i64> %op1, %op2 @@ -715,6 +1000,13 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) { ; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: abs v0.4h, v0.4h +; 
NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false) ret <4 x i8> %res } @@ -727,6 +1019,11 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) { ; CHECK-NEXT: abs z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false) ret <8 x i8> %res } @@ -739,6 +1036,11 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) { ; CHECK-NEXT: abs z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false) ret <16 x i8> %res } @@ -752,6 +1054,14 @@ define void @abs_v32i8(ptr %a) { ; CHECK-NEXT: abs z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.16b, v0.16b +; NONEON-NOSVE-NEXT: abs v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false) store <32 x i8> %res, ptr %a @@ -767,6 +1077,13 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false) ret <2 x i16> %res } @@ -779,6 +1096,11 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) { ; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.4h, v0.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false) ret <4 x i16> %res } @@ -791,6 +1113,11 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) { ; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.8h, v0.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false) ret <8 x i16> %res } @@ -804,6 +1131,14 @@ define void @abs_v16i16(ptr %a) { ; CHECK-NEXT: abs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.8h, v0.8h +; NONEON-NOSVE-NEXT: abs v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false) store <16 x i16> %res, ptr %a @@ -818,6 +1153,11 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) ret <2 x i32> %res } @@ -830,6 +1170,11 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) ret <4 x i32> %res } @@ -843,6 +1188,14 @@ define void @abs_v8i32(ptr %a) { ; 
CHECK-NEXT: abs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: abs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) store <8 x i32> %res, ptr %a @@ -857,6 +1210,11 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) { ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs d0, d0 +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false) ret <1 x i64> %res } @@ -869,6 +1227,11 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) { ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) ret <2 x i64> %res } @@ -882,6 +1245,14 @@ define void @abs_v4i64(ptr %a) { ; CHECK-NEXT: abs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: abs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) store <4 x i64> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll index c2f3bbfb51dd52..ee0ca0e60b5e51 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,11 @@ define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <8 x i8> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i8> ret <8 x i8> %sext @@ -33,6 +39,11 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <16 x i8> %op1, %op2 %sext = sext <16 x i1> %cmp to <16 x i8> ret <16 x i8> %sext @@ -50,6 +61,15 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cmeq v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %cmp = icmp eq <32 x i8> %op1, %op2 @@ 
-68,6 +88,11 @@ define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <4 x i16> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i16> ret <4 x i16> %sext @@ -83,6 +108,11 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <8 x i16> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i16> ret <8 x i16> %sext @@ -100,6 +130,15 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: cmeq v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %cmp = icmp eq <16 x i16> %op1, %op2 @@ -118,6 +157,11 @@ define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <2 x i32> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i32> ret <2 x i32> %sext @@ -133,6 +177,11 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 
x i32> %op2) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <4 x i32> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %sext @@ -150,6 +199,15 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: cmeq v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %cmp = icmp eq <8 x i32> %op1, %op2 @@ -168,6 +226,11 @@ define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <1 x i64> %op1, %op2 %sext = sext <1 x i1> %cmp to <1 x i64> ret <1 x i64> %sext @@ -183,6 +246,11 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmeq v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <2 x i64> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i64> ret <2 x i64> %sext @@ -200,6 +268,15 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, 
[x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmeq v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %cmp = icmp eq <4 x i64> %op1, %op2 @@ -224,6 +301,17 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ne_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cmeq v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: mvn v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %cmp = icmp ne <32 x i8> %op1, %op2 @@ -246,6 +334,14 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sge_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmge v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b %cmp = icmp sge <8 x i16> %op1, %op2 @@ -270,6 +366,15 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sgt_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmgt v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: cmgt 
v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %cmp = icmp sgt <16 x i16> %op1, %op2 @@ -292,6 +397,14 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sle_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmge v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %op2 = load <4 x i32>, ptr %b %cmp = icmp sle <4 x i32> %op1, %op2 @@ -316,6 +429,15 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_slt_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmgt v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: cmgt v1.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %cmp = icmp slt <8 x i32> %op1, %op2 @@ -338,6 +460,14 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_uge_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmhs v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %cmp = icmp uge <2 x i64> %op1, %op2 @@ -360,6 +490,14 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ugt_v2i64: 
+; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmhi v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %cmp = icmp ugt <2 x i64> %op1, %op2 @@ -382,6 +520,14 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ule_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmhs v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %cmp = icmp ule <2 x i64> %op1, %op2 @@ -404,6 +550,14 @@ define void @icmp_ult_v2i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ult_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: cmhi v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b %cmp = icmp ult <2 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index e6fd775b4cfb9b..d79d6c18ed5a6e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc 
-force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -24,6 +25,31 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w10, v0.h[0] +; NONEON-NOSVE-NEXT: smov w11, v0.h[2] +; NONEON-NOSVE-NEXT: smov w12, v0.h[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.h[2], w10 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -51,6 +77,45 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w10, v0.b[0] +; NONEON-NOSVE-NEXT: smov w11, v0.b[2] +; NONEON-NOSVE-NEXT: smov w12, v0.b[3] +; NONEON-NOSVE-NEXT: smov w13, v0.b[4] +; NONEON-NOSVE-NEXT: smov w14, v0.b[5] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; 
NONEON-NOSVE-NEXT: smov w9, v1.b[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: smov w9, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v2.b[2], w10 +; NONEON-NOSVE-NEXT: smov w10, v0.b[6] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.b[5] +; NONEON-NOSVE-NEXT: mov v2.b[3], w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[7] +; NONEON-NOSVE-NEXT: sdiv w8, w14, w13 +; NONEON-NOSVE-NEXT: mov v2.b[4], w12 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.b[6], w9 +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -98,6 +163,75 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w10, v0.b[0] +; NONEON-NOSVE-NEXT: smov w11, v0.b[2] +; NONEON-NOSVE-NEXT: smov w12, v0.b[3] +; NONEON-NOSVE-NEXT: smov w13, v0.b[4] +; NONEON-NOSVE-NEXT: smov w14, v0.b[5] +; NONEON-NOSVE-NEXT: smov w15, v0.b[6] +; NONEON-NOSVE-NEXT: smov w16, v0.b[7] +; NONEON-NOSVE-NEXT: smov w17, v0.b[8] +; NONEON-NOSVE-NEXT: smov w18, v0.b[9] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.b[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.b[3] +; 
NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: smov w9, v1.b[10] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v2.b[2], w10 +; NONEON-NOSVE-NEXT: smov w10, v0.b[10] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.b[5] +; NONEON-NOSVE-NEXT: mov v2.b[3], w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[11] +; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 +; NONEON-NOSVE-NEXT: smov w14, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[4], w12 +; NONEON-NOSVE-NEXT: smov w12, v0.b[12] +; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 +; NONEON-NOSVE-NEXT: smov w15, v1.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[5], w13 +; NONEON-NOSVE-NEXT: smov w13, v0.b[13] +; NONEON-NOSVE-NEXT: sdiv w15, w16, w15 +; NONEON-NOSVE-NEXT: smov w16, v1.b[8] +; NONEON-NOSVE-NEXT: mov v2.b[6], w14 +; NONEON-NOSVE-NEXT: sdiv w16, w17, w16 +; NONEON-NOSVE-NEXT: smov w17, v1.b[9] +; NONEON-NOSVE-NEXT: mov v2.b[7], w15 +; NONEON-NOSVE-NEXT: sdiv w8, w18, w17 +; NONEON-NOSVE-NEXT: mov v2.b[8], w16 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[11] +; NONEON-NOSVE-NEXT: mov v2.b[9], w8 +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.b[12] +; NONEON-NOSVE-NEXT: mov v2.b[10], w9 +; NONEON-NOSVE-NEXT: smov w9, v1.b[14] +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[11], w10 +; NONEON-NOSVE-NEXT: smov w10, v1.b[15] +; NONEON-NOSVE-NEXT: sdiv w8, w13, w12 +; NONEON-NOSVE-NEXT: smov w12, v0.b[14] +; NONEON-NOSVE-NEXT: mov v2.b[12], w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[15] +; NONEON-NOSVE-NEXT: sdiv w9, w12, w9 +; NONEON-NOSVE-NEXT: mov v2.b[13], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.b[14], w9 +; NONEON-NOSVE-NEXT: mov v2.b[15], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = sdiv <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ 
-178,6 +312,163 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str x27, [sp, #-80]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w10, v0.b[0] +; NONEON-NOSVE-NEXT: smov w11, v0.b[2] +; NONEON-NOSVE-NEXT: smov w12, v0.b[3] +; NONEON-NOSVE-NEXT: smov w13, v0.b[4] +; NONEON-NOSVE-NEXT: smov w14, v0.b[5] +; NONEON-NOSVE-NEXT: smov w15, v0.b[6] +; NONEON-NOSVE-NEXT: smov w17, v0.b[8] +; NONEON-NOSVE-NEXT: smov w2, v0.b[10] +; NONEON-NOSVE-NEXT: smov w3, v0.b[11] +; NONEON-NOSVE-NEXT: smov w4, v0.b[12] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.b[0] +; NONEON-NOSVE-NEXT: smov w5, v0.b[13] +; NONEON-NOSVE-NEXT: smov w6, v0.b[14] +; NONEON-NOSVE-NEXT: smov w1, v3.b[1] +; NONEON-NOSVE-NEXT: smov w7, v2.b[0] +; NONEON-NOSVE-NEXT: smov w19, v2.b[2] +; NONEON-NOSVE-NEXT: smov w20, v2.b[3] +; NONEON-NOSVE-NEXT: smov w21, v2.b[4] +; NONEON-NOSVE-NEXT: 
smov w22, v2.b[5] +; NONEON-NOSVE-NEXT: smov w23, v2.b[6] +; NONEON-NOSVE-NEXT: smov w24, v2.b[7] +; NONEON-NOSVE-NEXT: smov w25, v2.b[8] +; NONEON-NOSVE-NEXT: smov w26, v2.b[9] +; NONEON-NOSVE-NEXT: smov w27, v2.b[10] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[2] +; NONEON-NOSVE-NEXT: sdiv w11, w11, w10 +; NONEON-NOSVE-NEXT: smov w10, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s5, w9 +; NONEON-NOSVE-NEXT: smov w9, v3.b[11] +; NONEON-NOSVE-NEXT: mov v5.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w10, w12, w10 +; NONEON-NOSVE-NEXT: smov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v5.b[2], w11 +; NONEON-NOSVE-NEXT: smov w11, v2.b[11] +; NONEON-NOSVE-NEXT: sdiv w13, w13, w12 +; NONEON-NOSVE-NEXT: smov w12, v1.b[5] +; NONEON-NOSVE-NEXT: mov v5.b[3], w10 +; NONEON-NOSVE-NEXT: smov w10, v3.b[12] +; NONEON-NOSVE-NEXT: sdiv w12, w14, w12 +; NONEON-NOSVE-NEXT: smov w14, v1.b[6] +; NONEON-NOSVE-NEXT: mov v5.b[4], w13 +; NONEON-NOSVE-NEXT: smov w13, v2.b[14] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: smov w14, v1.b[7] +; NONEON-NOSVE-NEXT: smov w15, v0.b[7] +; NONEON-NOSVE-NEXT: mov v5.b[5], w12 +; NONEON-NOSVE-NEXT: smov w12, v2.b[13] +; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 +; NONEON-NOSVE-NEXT: smov w15, v1.b[8] +; NONEON-NOSVE-NEXT: mov v5.b[6], w16 +; NONEON-NOSVE-NEXT: sdiv w18, w17, w15 +; NONEON-NOSVE-NEXT: smov w15, v1.b[9] +; NONEON-NOSVE-NEXT: smov w17, v0.b[9] +; NONEON-NOSVE-NEXT: mov v5.b[7], w14 +; NONEON-NOSVE-NEXT: sdiv w17, w17, w15 +; NONEON-NOSVE-NEXT: smov w15, v1.b[10] +; NONEON-NOSVE-NEXT: mov v5.b[8], w18 +; NONEON-NOSVE-NEXT: sdiv w15, w2, w15 +; NONEON-NOSVE-NEXT: smov w2, v1.b[11] +; NONEON-NOSVE-NEXT: mov v5.b[9], w17 +; NONEON-NOSVE-NEXT: sdiv w2, w3, w2 +; NONEON-NOSVE-NEXT: smov w3, v1.b[12] +; NONEON-NOSVE-NEXT: mov v5.b[10], w15 +; NONEON-NOSVE-NEXT: sdiv w3, w4, w3 +; NONEON-NOSVE-NEXT: smov w4, v1.b[13] +; NONEON-NOSVE-NEXT: mov v5.b[11], w2 +; NONEON-NOSVE-NEXT: sdiv w4, w5, w4 +; NONEON-NOSVE-NEXT: smov w5, 
v1.b[14] +; NONEON-NOSVE-NEXT: mov v5.b[12], w3 +; NONEON-NOSVE-NEXT: sdiv w5, w6, w5 +; NONEON-NOSVE-NEXT: smov w6, v2.b[1] +; NONEON-NOSVE-NEXT: mov v5.b[13], w4 +; NONEON-NOSVE-NEXT: sdiv w1, w6, w1 +; NONEON-NOSVE-NEXT: smov w6, v3.b[0] +; NONEON-NOSVE-NEXT: mov v5.b[14], w5 +; NONEON-NOSVE-NEXT: sdiv w6, w7, w6 +; NONEON-NOSVE-NEXT: smov w7, v3.b[2] +; NONEON-NOSVE-NEXT: sdiv w7, w19, w7 +; NONEON-NOSVE-NEXT: smov w19, v3.b[3] +; NONEON-NOSVE-NEXT: fmov s4, w6 +; NONEON-NOSVE-NEXT: mov v4.b[1], w1 +; NONEON-NOSVE-NEXT: sdiv w19, w20, w19 +; NONEON-NOSVE-NEXT: smov w20, v3.b[4] +; NONEON-NOSVE-NEXT: mov v4.b[2], w7 +; NONEON-NOSVE-NEXT: sdiv w20, w21, w20 +; NONEON-NOSVE-NEXT: smov w21, v3.b[5] +; NONEON-NOSVE-NEXT: mov v4.b[3], w19 +; NONEON-NOSVE-NEXT: sdiv w21, w22, w21 +; NONEON-NOSVE-NEXT: smov w22, v3.b[6] +; NONEON-NOSVE-NEXT: mov v4.b[4], w20 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w22, w23, w22 +; NONEON-NOSVE-NEXT: smov w23, v3.b[7] +; NONEON-NOSVE-NEXT: mov v4.b[5], w21 +; NONEON-NOSVE-NEXT: sdiv w23, w24, w23 +; NONEON-NOSVE-NEXT: smov w24, v3.b[8] +; NONEON-NOSVE-NEXT: mov v4.b[6], w22 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w24, w25, w24 +; NONEON-NOSVE-NEXT: smov w25, v3.b[9] +; NONEON-NOSVE-NEXT: mov v4.b[7], w23 +; NONEON-NOSVE-NEXT: sdiv w25, w26, w25 +; NONEON-NOSVE-NEXT: smov w26, v3.b[10] +; NONEON-NOSVE-NEXT: mov v4.b[8], w24 +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w8, w27, w26 +; NONEON-NOSVE-NEXT: mov v4.b[9], w25 +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w9, w11, w9 +; NONEON-NOSVE-NEXT: smov w11, v2.b[12] +; NONEON-NOSVE-NEXT: mov v4.b[10], w8 +; NONEON-NOSVE-NEXT: smov w8, v3.b[15] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v3.b[13] +; NONEON-NOSVE-NEXT: mov v4.b[11], w9 +; 
NONEON-NOSVE-NEXT: smov w9, v1.b[15] +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v3.b[14] +; NONEON-NOSVE-NEXT: mov v4.b[12], w10 +; NONEON-NOSVE-NEXT: smov w10, v0.b[15] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v2.b[15] +; NONEON-NOSVE-NEXT: mov v4.b[13], w11 +; NONEON-NOSVE-NEXT: sdiv w8, w13, w8 +; NONEON-NOSVE-NEXT: mov v4.b[14], w12 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov v4.b[15], w8 +; NONEON-NOSVE-NEXT: mov v5.b[15], w9 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = sdiv <32 x i8> %op1, %op2 @@ -196,6 +487,23 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -212,6 +520,29 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; 
NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w10, v0.h[0] +; NONEON-NOSVE-NEXT: smov w11, v0.h[2] +; NONEON-NOSVE-NEXT: smov w12, v0.h[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.h[2], w10 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -238,6 +569,43 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w10, v0.h[0] +; NONEON-NOSVE-NEXT: smov w11, v0.h[2] +; NONEON-NOSVE-NEXT: smov w12, v0.h[3] +; NONEON-NOSVE-NEXT: smov w13, v0.h[4] +; NONEON-NOSVE-NEXT: smov w14, v0.h[5] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: smov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: smov w9, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[2], w10 +; NONEON-NOSVE-NEXT: smov w10, v0.h[6] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.h[5] +; NONEON-NOSVE-NEXT: mov v2.h[3], w11 +; NONEON-NOSVE-NEXT: smov w11, v0.h[7] +; NONEON-NOSVE-NEXT: sdiv 
w8, w14, w13 +; NONEON-NOSVE-NEXT: mov v2.h[4], w12 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.h[6], w9 +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -278,6 +646,79 @@ define void @sdiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w10, v0.h[0] +; NONEON-NOSVE-NEXT: smov w11, v0.h[2] +; NONEON-NOSVE-NEXT: smov w12, v0.h[3] +; NONEON-NOSVE-NEXT: smov w13, v0.h[4] +; NONEON-NOSVE-NEXT: smov w14, v0.h[5] +; NONEON-NOSVE-NEXT: smov w15, v0.h[6] +; NONEON-NOSVE-NEXT: smov w16, v2.h[1] +; NONEON-NOSVE-NEXT: smov w17, v2.h[0] +; NONEON-NOSVE-NEXT: smov w18, v2.h[2] +; NONEON-NOSVE-NEXT: smov w1, v2.h[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: smov w2, v2.h[4] +; NONEON-NOSVE-NEXT: smov w3, v2.h[5] +; NONEON-NOSVE-NEXT: smov w4, v2.h[6] +; NONEON-NOSVE-NEXT: sdiv w10, w10, w9 +; NONEON-NOSVE-NEXT: smov w9, v1.h[2] +; NONEON-NOSVE-NEXT: sdiv w9, w11, w9 +; NONEON-NOSVE-NEXT: smov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s5, w10 +; NONEON-NOSVE-NEXT: smov w10, v3.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: smov w12, v1.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[2], w9 +; NONEON-NOSVE-NEXT: smov w9, v2.h[7] +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[3], w11 +; 
NONEON-NOSVE-NEXT: smov w11, v0.h[7] +; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 +; NONEON-NOSVE-NEXT: smov w14, v1.h[6] +; NONEON-NOSVE-NEXT: mov v5.h[4], w12 +; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 +; NONEON-NOSVE-NEXT: smov w15, v3.h[1] +; NONEON-NOSVE-NEXT: mov v5.h[5], w13 +; NONEON-NOSVE-NEXT: sdiv w15, w16, w15 +; NONEON-NOSVE-NEXT: smov w16, v3.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[6], w14 +; NONEON-NOSVE-NEXT: sdiv w16, w17, w16 +; NONEON-NOSVE-NEXT: smov w17, v3.h[2] +; NONEON-NOSVE-NEXT: sdiv w17, w18, w17 +; NONEON-NOSVE-NEXT: smov w18, v3.h[3] +; NONEON-NOSVE-NEXT: fmov s4, w16 +; NONEON-NOSVE-NEXT: mov v4.h[1], w15 +; NONEON-NOSVE-NEXT: sdiv w18, w1, w18 +; NONEON-NOSVE-NEXT: smov w1, v3.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[2], w17 +; NONEON-NOSVE-NEXT: sdiv w1, w2, w1 +; NONEON-NOSVE-NEXT: smov w2, v3.h[5] +; NONEON-NOSVE-NEXT: mov v4.h[3], w18 +; NONEON-NOSVE-NEXT: sdiv w2, w3, w2 +; NONEON-NOSVE-NEXT: smov w3, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[4], w1 +; NONEON-NOSVE-NEXT: sdiv w8, w4, w3 +; NONEON-NOSVE-NEXT: mov v4.h[5], w2 +; NONEON-NOSVE-NEXT: sdiv w9, w9, w10 +; NONEON-NOSVE-NEXT: smov w10, v1.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[6], w8 +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov v4.h[7], w9 +; NONEON-NOSVE-NEXT: mov v5.h[7], w10 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = sdiv <16 x i16> %op1, %op2 @@ -294,6 +735,21 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: 
mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -307,6 +763,26 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w10, s0 +; NONEON-NOSVE-NEXT: mov w11, v0.s[2] +; NONEON-NOSVE-NEXT: mov w12, v0.s[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: fmov w9, s1 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov w10, v1.s[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov w11, v1.s[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.s[2], w10 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -322,6 +798,45 @@ define void @sdiv_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w10, s0 +; NONEON-NOSVE-NEXT: mov w11, v0.s[2] +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w12, v2.s[1] +; NONEON-NOSVE-NEXT: fmov w13, s2 +; NONEON-NOSVE-NEXT: mov w14, v2.s[2] +; NONEON-NOSVE-NEXT: mov w15, v2.s[3] +; NONEON-NOSVE-NEXT: mov w16, v0.s[3] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: fmov w9, s1 +; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov w10, 
v1.s[2] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov w11, v3.s[1] +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: fmov w12, s3 +; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 +; NONEON-NOSVE-NEXT: mov w13, v3.s[2] +; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 +; NONEON-NOSVE-NEXT: mov w14, v3.s[3] +; NONEON-NOSVE-NEXT: fmov s0, w12 +; NONEON-NOSVE-NEXT: mov v0.s[1], w11 +; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 +; NONEON-NOSVE-NEXT: mov w15, v1.s[3] +; NONEON-NOSVE-NEXT: fmov s1, w9 +; NONEON-NOSVE-NEXT: mov v0.s[2], w13 +; NONEON-NOSVE-NEXT: mov v1.s[1], w8 +; NONEON-NOSVE-NEXT: mov v1.s[2], w10 +; NONEON-NOSVE-NEXT: sdiv w8, w16, w15 +; NONEON-NOSVE-NEXT: mov v0.s[3], w14 +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = sdiv <8 x i32> %op1, %op2 @@ -338,6 +853,16 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = sdiv <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -351,6 +876,18 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x10, v0.d[1] +; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: sdiv x9, x10, x9 +; 
NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -366,6 +903,29 @@ define void @sdiv_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x10, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x11, d2 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 +; NONEON-NOSVE-NEXT: mov x9, v3.d[1] +; NONEON-NOSVE-NEXT: sdiv x9, x10, x9 +; NONEON-NOSVE-NEXT: fmov x10, d3 +; NONEON-NOSVE-NEXT: sdiv x10, x11, x10 +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: sdiv x11, x12, x11 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = sdiv <4 x i64> %op1, %op2 @@ -391,6 +951,37 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w10, v0.h[0] +; NONEON-NOSVE-NEXT: umov w11, v0.h[2] +; NONEON-NOSVE-NEXT: umov w12, v0.h[3] +; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: and w9, w9, #0xff +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.h[0] +; NONEON-NOSVE-NEXT: and w11, 
w11, #0xff +; NONEON-NOSVE-NEXT: and w9, w9, #0xff +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[2] +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: and w9, w11, #0xff +; NONEON-NOSVE-NEXT: and w11, w12, #0xff +; NONEON-NOSVE-NEXT: udiv w8, w11, w9 +; NONEON-NOSVE-NEXT: mov v0.h[2], w10 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -418,6 +1009,45 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w10, v0.b[0] +; NONEON-NOSVE-NEXT: umov w11, v0.b[2] +; NONEON-NOSVE-NEXT: umov w12, v0.b[3] +; NONEON-NOSVE-NEXT: umov w13, v0.b[4] +; NONEON-NOSVE-NEXT: umov w14, v0.b[5] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.b[0] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: umov w9, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v2.b[2], w10 +; NONEON-NOSVE-NEXT: umov w10, v0.b[6] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.b[5] +; NONEON-NOSVE-NEXT: mov v2.b[3], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[7] +; NONEON-NOSVE-NEXT: 
udiv w8, w14, w13 +; NONEON-NOSVE-NEXT: mov v2.b[4], w12 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: udiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.b[6], w9 +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = udiv <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -465,6 +1095,75 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w10, v0.b[0] +; NONEON-NOSVE-NEXT: umov w11, v0.b[2] +; NONEON-NOSVE-NEXT: umov w12, v0.b[3] +; NONEON-NOSVE-NEXT: umov w13, v0.b[4] +; NONEON-NOSVE-NEXT: umov w14, v0.b[5] +; NONEON-NOSVE-NEXT: umov w15, v0.b[6] +; NONEON-NOSVE-NEXT: umov w16, v0.b[7] +; NONEON-NOSVE-NEXT: umov w17, v0.b[8] +; NONEON-NOSVE-NEXT: umov w18, v0.b[9] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.b[0] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: umov w9, v1.b[10] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v2.b[2], w10 +; NONEON-NOSVE-NEXT: umov w10, v0.b[10] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.b[5] +; NONEON-NOSVE-NEXT: mov v2.b[3], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[11] +; NONEON-NOSVE-NEXT: udiv w13, w14, w13 +; NONEON-NOSVE-NEXT: umov w14, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[4], w12 +; NONEON-NOSVE-NEXT: umov w12, v0.b[12] +; NONEON-NOSVE-NEXT: udiv w14, w15, w14 +; NONEON-NOSVE-NEXT: 
umov w15, v1.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[5], w13 +; NONEON-NOSVE-NEXT: umov w13, v0.b[13] +; NONEON-NOSVE-NEXT: udiv w15, w16, w15 +; NONEON-NOSVE-NEXT: umov w16, v1.b[8] +; NONEON-NOSVE-NEXT: mov v2.b[6], w14 +; NONEON-NOSVE-NEXT: udiv w16, w17, w16 +; NONEON-NOSVE-NEXT: umov w17, v1.b[9] +; NONEON-NOSVE-NEXT: mov v2.b[7], w15 +; NONEON-NOSVE-NEXT: udiv w8, w18, w17 +; NONEON-NOSVE-NEXT: mov v2.b[8], w16 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[11] +; NONEON-NOSVE-NEXT: mov v2.b[9], w8 +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.b[12] +; NONEON-NOSVE-NEXT: mov v2.b[10], w9 +; NONEON-NOSVE-NEXT: umov w9, v1.b[14] +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[11], w10 +; NONEON-NOSVE-NEXT: umov w10, v1.b[15] +; NONEON-NOSVE-NEXT: udiv w8, w13, w12 +; NONEON-NOSVE-NEXT: umov w12, v0.b[14] +; NONEON-NOSVE-NEXT: mov v2.b[12], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[15] +; NONEON-NOSVE-NEXT: udiv w9, w12, w9 +; NONEON-NOSVE-NEXT: mov v2.b[13], w8 +; NONEON-NOSVE-NEXT: udiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.b[14], w9 +; NONEON-NOSVE-NEXT: mov v2.b[15], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = udiv <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -545,6 +1244,163 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str x27, [sp, #-80]! 
// 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w10, v0.b[0] +; NONEON-NOSVE-NEXT: umov w11, v0.b[2] +; NONEON-NOSVE-NEXT: umov w12, v0.b[3] +; NONEON-NOSVE-NEXT: umov w13, v0.b[4] +; NONEON-NOSVE-NEXT: umov w14, v0.b[5] +; NONEON-NOSVE-NEXT: umov w15, v0.b[6] +; NONEON-NOSVE-NEXT: umov w17, v0.b[8] +; NONEON-NOSVE-NEXT: umov w2, v0.b[10] +; NONEON-NOSVE-NEXT: umov w3, v0.b[11] +; NONEON-NOSVE-NEXT: umov w4, v0.b[12] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.b[0] +; NONEON-NOSVE-NEXT: umov w5, v0.b[13] +; NONEON-NOSVE-NEXT: umov w6, v0.b[14] +; NONEON-NOSVE-NEXT: umov w1, v3.b[1] +; NONEON-NOSVE-NEXT: umov w7, v2.b[0] +; NONEON-NOSVE-NEXT: umov w19, v2.b[2] +; NONEON-NOSVE-NEXT: umov w20, v2.b[3] +; NONEON-NOSVE-NEXT: umov w21, v2.b[4] +; NONEON-NOSVE-NEXT: umov w22, v2.b[5] +; NONEON-NOSVE-NEXT: umov w23, v2.b[6] +; NONEON-NOSVE-NEXT: umov w24, v2.b[7] +; NONEON-NOSVE-NEXT: umov w25, v2.b[8] +; NONEON-NOSVE-NEXT: umov w26, v2.b[9] +; NONEON-NOSVE-NEXT: umov w27, v2.b[10] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; 
NONEON-NOSVE-NEXT: umov w10, v1.b[2] +; NONEON-NOSVE-NEXT: udiv w11, w11, w10 +; NONEON-NOSVE-NEXT: umov w10, v1.b[3] +; NONEON-NOSVE-NEXT: fmov s5, w9 +; NONEON-NOSVE-NEXT: umov w9, v3.b[11] +; NONEON-NOSVE-NEXT: mov v5.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w10, w12, w10 +; NONEON-NOSVE-NEXT: umov w12, v1.b[4] +; NONEON-NOSVE-NEXT: mov v5.b[2], w11 +; NONEON-NOSVE-NEXT: umov w11, v2.b[11] +; NONEON-NOSVE-NEXT: udiv w13, w13, w12 +; NONEON-NOSVE-NEXT: umov w12, v1.b[5] +; NONEON-NOSVE-NEXT: mov v5.b[3], w10 +; NONEON-NOSVE-NEXT: umov w10, v3.b[12] +; NONEON-NOSVE-NEXT: udiv w12, w14, w12 +; NONEON-NOSVE-NEXT: umov w14, v1.b[6] +; NONEON-NOSVE-NEXT: mov v5.b[4], w13 +; NONEON-NOSVE-NEXT: umov w13, v2.b[14] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: umov w14, v1.b[7] +; NONEON-NOSVE-NEXT: umov w15, v0.b[7] +; NONEON-NOSVE-NEXT: mov v5.b[5], w12 +; NONEON-NOSVE-NEXT: umov w12, v2.b[13] +; NONEON-NOSVE-NEXT: udiv w14, w15, w14 +; NONEON-NOSVE-NEXT: umov w15, v1.b[8] +; NONEON-NOSVE-NEXT: mov v5.b[6], w16 +; NONEON-NOSVE-NEXT: udiv w18, w17, w15 +; NONEON-NOSVE-NEXT: umov w15, v1.b[9] +; NONEON-NOSVE-NEXT: umov w17, v0.b[9] +; NONEON-NOSVE-NEXT: mov v5.b[7], w14 +; NONEON-NOSVE-NEXT: udiv w17, w17, w15 +; NONEON-NOSVE-NEXT: umov w15, v1.b[10] +; NONEON-NOSVE-NEXT: mov v5.b[8], w18 +; NONEON-NOSVE-NEXT: udiv w15, w2, w15 +; NONEON-NOSVE-NEXT: umov w2, v1.b[11] +; NONEON-NOSVE-NEXT: mov v5.b[9], w17 +; NONEON-NOSVE-NEXT: udiv w2, w3, w2 +; NONEON-NOSVE-NEXT: umov w3, v1.b[12] +; NONEON-NOSVE-NEXT: mov v5.b[10], w15 +; NONEON-NOSVE-NEXT: udiv w3, w4, w3 +; NONEON-NOSVE-NEXT: umov w4, v1.b[13] +; NONEON-NOSVE-NEXT: mov v5.b[11], w2 +; NONEON-NOSVE-NEXT: udiv w4, w5, w4 +; NONEON-NOSVE-NEXT: umov w5, v1.b[14] +; NONEON-NOSVE-NEXT: mov v5.b[12], w3 +; NONEON-NOSVE-NEXT: udiv w5, w6, w5 +; NONEON-NOSVE-NEXT: umov w6, v2.b[1] +; NONEON-NOSVE-NEXT: mov v5.b[13], w4 +; NONEON-NOSVE-NEXT: udiv w1, w6, w1 +; NONEON-NOSVE-NEXT: umov w6, v3.b[0] +; 
NONEON-NOSVE-NEXT: mov v5.b[14], w5 +; NONEON-NOSVE-NEXT: udiv w6, w7, w6 +; NONEON-NOSVE-NEXT: umov w7, v3.b[2] +; NONEON-NOSVE-NEXT: udiv w7, w19, w7 +; NONEON-NOSVE-NEXT: umov w19, v3.b[3] +; NONEON-NOSVE-NEXT: fmov s4, w6 +; NONEON-NOSVE-NEXT: mov v4.b[1], w1 +; NONEON-NOSVE-NEXT: udiv w19, w20, w19 +; NONEON-NOSVE-NEXT: umov w20, v3.b[4] +; NONEON-NOSVE-NEXT: mov v4.b[2], w7 +; NONEON-NOSVE-NEXT: udiv w20, w21, w20 +; NONEON-NOSVE-NEXT: umov w21, v3.b[5] +; NONEON-NOSVE-NEXT: mov v4.b[3], w19 +; NONEON-NOSVE-NEXT: udiv w21, w22, w21 +; NONEON-NOSVE-NEXT: umov w22, v3.b[6] +; NONEON-NOSVE-NEXT: mov v4.b[4], w20 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w22, w23, w22 +; NONEON-NOSVE-NEXT: umov w23, v3.b[7] +; NONEON-NOSVE-NEXT: mov v4.b[5], w21 +; NONEON-NOSVE-NEXT: udiv w23, w24, w23 +; NONEON-NOSVE-NEXT: umov w24, v3.b[8] +; NONEON-NOSVE-NEXT: mov v4.b[6], w22 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w24, w25, w24 +; NONEON-NOSVE-NEXT: umov w25, v3.b[9] +; NONEON-NOSVE-NEXT: mov v4.b[7], w23 +; NONEON-NOSVE-NEXT: udiv w25, w26, w25 +; NONEON-NOSVE-NEXT: umov w26, v3.b[10] +; NONEON-NOSVE-NEXT: mov v4.b[8], w24 +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w8, w27, w26 +; NONEON-NOSVE-NEXT: mov v4.b[9], w25 +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w9, w11, w9 +; NONEON-NOSVE-NEXT: umov w11, v2.b[12] +; NONEON-NOSVE-NEXT: mov v4.b[10], w8 +; NONEON-NOSVE-NEXT: umov w8, v3.b[15] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v3.b[13] +; NONEON-NOSVE-NEXT: mov v4.b[11], w9 +; NONEON-NOSVE-NEXT: umov w9, v1.b[15] +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v3.b[14] +; NONEON-NOSVE-NEXT: mov v4.b[12], w10 +; NONEON-NOSVE-NEXT: umov w10, v0.b[15] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; 
NONEON-NOSVE-NEXT: umov w13, v2.b[15] +; NONEON-NOSVE-NEXT: mov v4.b[13], w11 +; NONEON-NOSVE-NEXT: udiv w8, w13, w8 +; NONEON-NOSVE-NEXT: mov v4.b[14], w12 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov v4.b[15], w8 +; NONEON-NOSVE-NEXT: mov v5.b[15], w9 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = udiv <32 x i8> %op1, %op2 @@ -563,6 +1419,22 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -579,6 +1451,29 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w10, v0.h[0] +; NONEON-NOSVE-NEXT: umov w11, v0.h[2] +; NONEON-NOSVE-NEXT: umov w12, v0.h[3] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.h[0] +; 
NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.h[2], w10 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -605,6 +1500,43 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w10, v0.h[0] +; NONEON-NOSVE-NEXT: umov w11, v0.h[2] +; NONEON-NOSVE-NEXT: umov w12, v0.h[3] +; NONEON-NOSVE-NEXT: umov w13, v0.h[4] +; NONEON-NOSVE-NEXT: umov w14, v0.h[5] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.h[0] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: umov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s2, w9 +; NONEON-NOSVE-NEXT: umov w9, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.h[4] +; NONEON-NOSVE-NEXT: mov v2.h[2], w10 +; NONEON-NOSVE-NEXT: umov w10, v0.h[6] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.h[5] +; NONEON-NOSVE-NEXT: mov v2.h[3], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.h[7] +; NONEON-NOSVE-NEXT: udiv w8, w14, w13 +; NONEON-NOSVE-NEXT: mov v2.h[4], w12 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: udiv w8, w11, w10 +; NONEON-NOSVE-NEXT: mov v2.h[6], w9 +; NONEON-NOSVE-NEXT: mov 
v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = udiv <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -645,6 +1577,79 @@ define void @udiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w10, v0.h[0] +; NONEON-NOSVE-NEXT: umov w11, v0.h[2] +; NONEON-NOSVE-NEXT: umov w12, v0.h[3] +; NONEON-NOSVE-NEXT: umov w13, v0.h[4] +; NONEON-NOSVE-NEXT: umov w14, v0.h[5] +; NONEON-NOSVE-NEXT: umov w15, v0.h[6] +; NONEON-NOSVE-NEXT: umov w16, v2.h[1] +; NONEON-NOSVE-NEXT: umov w17, v2.h[0] +; NONEON-NOSVE-NEXT: umov w18, v2.h[2] +; NONEON-NOSVE-NEXT: umov w1, v2.h[3] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v1.h[0] +; NONEON-NOSVE-NEXT: umov w2, v2.h[4] +; NONEON-NOSVE-NEXT: umov w3, v2.h[5] +; NONEON-NOSVE-NEXT: umov w4, v2.h[6] +; NONEON-NOSVE-NEXT: udiv w10, w10, w9 +; NONEON-NOSVE-NEXT: umov w9, v1.h[2] +; NONEON-NOSVE-NEXT: udiv w9, w11, w9 +; NONEON-NOSVE-NEXT: umov w11, v1.h[3] +; NONEON-NOSVE-NEXT: fmov s5, w10 +; NONEON-NOSVE-NEXT: umov w10, v3.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: umov w12, v1.h[4] +; NONEON-NOSVE-NEXT: mov v5.h[2], w9 +; NONEON-NOSVE-NEXT: umov w9, v2.h[7] +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.h[5] +; NONEON-NOSVE-NEXT: mov v5.h[3], w11 +; NONEON-NOSVE-NEXT: umov w11, v0.h[7] +; NONEON-NOSVE-NEXT: udiv w13, w14, w13 +; NONEON-NOSVE-NEXT: umov w14, v1.h[6] +; NONEON-NOSVE-NEXT: mov v5.h[4], w12 +; NONEON-NOSVE-NEXT: udiv w14, w15, w14 +; NONEON-NOSVE-NEXT: umov w15, v3.h[1] +; NONEON-NOSVE-NEXT: mov 
v5.h[5], w13 +; NONEON-NOSVE-NEXT: udiv w15, w16, w15 +; NONEON-NOSVE-NEXT: umov w16, v3.h[0] +; NONEON-NOSVE-NEXT: mov v5.h[6], w14 +; NONEON-NOSVE-NEXT: udiv w16, w17, w16 +; NONEON-NOSVE-NEXT: umov w17, v3.h[2] +; NONEON-NOSVE-NEXT: udiv w17, w18, w17 +; NONEON-NOSVE-NEXT: umov w18, v3.h[3] +; NONEON-NOSVE-NEXT: fmov s4, w16 +; NONEON-NOSVE-NEXT: mov v4.h[1], w15 +; NONEON-NOSVE-NEXT: udiv w18, w1, w18 +; NONEON-NOSVE-NEXT: umov w1, v3.h[4] +; NONEON-NOSVE-NEXT: mov v4.h[2], w17 +; NONEON-NOSVE-NEXT: udiv w1, w2, w1 +; NONEON-NOSVE-NEXT: umov w2, v3.h[5] +; NONEON-NOSVE-NEXT: mov v4.h[3], w18 +; NONEON-NOSVE-NEXT: udiv w2, w3, w2 +; NONEON-NOSVE-NEXT: umov w3, v3.h[6] +; NONEON-NOSVE-NEXT: mov v4.h[4], w1 +; NONEON-NOSVE-NEXT: udiv w8, w4, w3 +; NONEON-NOSVE-NEXT: mov v4.h[5], w2 +; NONEON-NOSVE-NEXT: udiv w9, w9, w10 +; NONEON-NOSVE-NEXT: umov w10, v1.h[7] +; NONEON-NOSVE-NEXT: mov v4.h[6], w8 +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov v4.h[7], w9 +; NONEON-NOSVE-NEXT: mov v5.h[7], w10 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = udiv <16 x i16> %op1, %op2 @@ -661,6 +1666,21 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i32> %op1, %op2 ret <2 x i32> 
%res } @@ -674,6 +1694,26 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w10, s0 +; NONEON-NOSVE-NEXT: mov w11, v0.s[2] +; NONEON-NOSVE-NEXT: mov w12, v0.s[3] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: fmov w9, s1 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov w10, v1.s[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov w11, v1.s[3] +; NONEON-NOSVE-NEXT: fmov s0, w9 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: udiv w8, w12, w11 +; NONEON-NOSVE-NEXT: mov v0.s[2], w10 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -689,6 +1729,45 @@ define void @udiv_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w10, s0 +; NONEON-NOSVE-NEXT: mov w11, v0.s[2] +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w12, v2.s[1] +; NONEON-NOSVE-NEXT: fmov w13, s2 +; NONEON-NOSVE-NEXT: mov w14, v2.s[2] +; NONEON-NOSVE-NEXT: mov w15, v2.s[3] +; NONEON-NOSVE-NEXT: mov w16, v0.s[3] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: fmov w9, s1 +; NONEON-NOSVE-NEXT: udiv w9, w10, w9 +; NONEON-NOSVE-NEXT: mov w10, v1.s[2] +; NONEON-NOSVE-NEXT: udiv w10, w11, w10 +; NONEON-NOSVE-NEXT: mov w11, v3.s[1] +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: fmov w12, s3 +; NONEON-NOSVE-NEXT: udiv w12, w13, w12 +; NONEON-NOSVE-NEXT: mov w13, v3.s[2] +; 
NONEON-NOSVE-NEXT: udiv w13, w14, w13 +; NONEON-NOSVE-NEXT: mov w14, v3.s[3] +; NONEON-NOSVE-NEXT: fmov s0, w12 +; NONEON-NOSVE-NEXT: mov v0.s[1], w11 +; NONEON-NOSVE-NEXT: udiv w14, w15, w14 +; NONEON-NOSVE-NEXT: mov w15, v1.s[3] +; NONEON-NOSVE-NEXT: fmov s1, w9 +; NONEON-NOSVE-NEXT: mov v0.s[2], w13 +; NONEON-NOSVE-NEXT: mov v1.s[1], w8 +; NONEON-NOSVE-NEXT: mov v1.s[2], w10 +; NONEON-NOSVE-NEXT: udiv w8, w16, w15 +; NONEON-NOSVE-NEXT: mov v0.s[3], w14 +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = udiv <8 x i32> %op1, %op2 @@ -705,6 +1784,16 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: udiv x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = udiv <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -718,6 +1807,18 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x10, v0.d[1] +; NONEON-NOSVE-NEXT: udiv x8, x9, x8 +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: udiv x9, x10, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -733,6 +1834,29 @@ define void @udiv_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z3.d ; 
CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x10, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x11, d2 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: udiv x8, x9, x8 +; NONEON-NOSVE-NEXT: mov x9, v3.d[1] +; NONEON-NOSVE-NEXT: udiv x9, x10, x9 +; NONEON-NOSVE-NEXT: fmov x10, d3 +; NONEON-NOSVE-NEXT: udiv x10, x11, x10 +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: udiv x11, x12, x11 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = udiv <4 x i64> %op1, %op2 @@ -778,6 +1902,27 @@ define void @udiv_constantsplat_v8i32(ptr %a) { ; SVE2-NEXT: lsr z0.s, z0.s, #6 ; SVE2-NEXT: stp q1, q0, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #8969 // =0x2309 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: movk w8, #22765, lsl #16 +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: umull2 v3.2d, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umull v4.2d, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: umull2 v5.2d, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: umull v0.2d, v2.2s, v0.2s +; NONEON-NOSVE-NEXT: uzp2 v3.4s, v4.4s, v3.4s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v5.4s +; NONEON-NOSVE-NEXT: sub v1.4s, v1.4s, v3.4s +; NONEON-NOSVE-NEXT: sub v2.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: usra v3.4s, v1.4s, #1 +; NONEON-NOSVE-NEXT: usra v0.4s, v2.4s, #1 +; NONEON-NOSVE-NEXT: ushr v1.4s, v3.4s, #6 +; NONEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #6 +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res 
= udiv <8 x i32> %op1, store <8 x i32> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll index e40668a8696ee2..9f8511b00c6ed1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -26,6 +27,22 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) { ; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i1_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: shl v0.4s, v0.4s, #31 +; NONEON-NOSVE-NEXT: shl v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: cmlt v0.4s, v0.4s, #0 +; NONEON-NOSVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i1> %a to <8 x i32> store <8 x i32> %b, ptr %out ret void @@ -52,6 +69,22 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) { ; CHECK-NEXT: asr z0.d, z0.d, #61 ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v4i3_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #61 +; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #61 +; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #61 +; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #61 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <4 x i3> %a to <4 x i64> store <4 x i64> %b, ptr %out ret void @@ -70,6 +103,17 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i8_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i16> store <16 x i16>%b, ptr %out ret void @@ -91,6 +135,24 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v32i8_v32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = sext <32 x i8> %b to <32 x i16> @@ -112,6 +174,18 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i8_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i8> %a to <8 x i32> store <8 x i32>%b, ptr %out ret void @@ -133,6 +207,25 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i8_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i32> store <16 x i32> %b, ptr %out ret void @@ -167,6 +260,40 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v32i8_v32i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] +; NONEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0 +; NONEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: sshll v0.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v6.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: sshll v1.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v7.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = sext <32 x i8> %b to <32 x i32> @@ -194,6 +321,22 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; CHECK-NEXT: sxtb z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v4i8_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #56 +; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #56 +; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #56 +; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #56 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <4 x i8> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -216,6 +359,26 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i8_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i8> %a to <8 x i64> store <8 x i64>%b, ptr %out ret void @@ -253,6 +416,41 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q1, q4, [x0, #32] ; CHECK-NEXT: stp q0, q2, [x0, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i8_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-112]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #48] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #72] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q5, [x0, #64] +; NONEON-NOSVE-NEXT: sshll v1.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q4, [x0] +; NONEON-NOSVE-NEXT: sshll v0.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #96] +; NONEON-NOSVE-NEXT: stp q0, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #112 +; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i64> store <16 x i64> %b, ptr %out ret void @@ -321,6 +519,73 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q0, q2, [x1, #224] ; CHECK-NEXT: stp q3, q1, [x1, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v32i8_v32i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #224 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: sshll v5.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: sshll v6.8h, v1.8b, #0 +; 
NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v3.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v4.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: stp q3, q5, [sp, #32] +; NONEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #64] +; NONEON-NOSVE-NEXT: sshll v6.4s, v6.4h, #0 +; NONEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v7.4s, v7.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q5, [sp, #128] +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ldr d19, [sp, #152] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #96] +; NONEON-NOSVE-NEXT: ldr d20, [sp, #136] +; NONEON-NOSVE-NEXT: stp q1, q4, [sp, #160] +; NONEON-NOSVE-NEXT: ldr d17, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d21, [sp, #120] +; NONEON-NOSVE-NEXT: stp q7, q6, [sp, #192] +; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: sshll v19.2d, v19.2s, #0 +; NONEON-NOSVE-NEXT: ldr d16, [sp, #216] +; NONEON-NOSVE-NEXT: ldr d22, [sp, #200] +; NONEON-NOSVE-NEXT: ldr d23, [sp, #184] +; NONEON-NOSVE-NEXT: ldr d18, [sp, #168] +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: sshll v16.2d, v16.2s, #0 +; NONEON-NOSVE-NEXT: stp q5, q19, [x1] +; NONEON-NOSVE-NEXT: sshll v5.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: sshll v7.2d, v22.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: stp q6, q16, [x1, #128] +; NONEON-NOSVE-NEXT: sshll v6.2d, v23.2s, #0 +; NONEON-NOSVE-NEXT: stp q5, q7, [x1, #160] +; NONEON-NOSVE-NEXT: sshll 
v5.2d, v20.2s, #0 +; NONEON-NOSVE-NEXT: stp q4, q6, [x1, #192] +; NONEON-NOSVE-NEXT: sshll v4.2d, v21.2s, #0 +; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #32] +; NONEON-NOSVE-NEXT: sshll v2.2d, v17.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #64] +; NONEON-NOSVE-NEXT: sshll v3.2d, v18.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #96] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #224] +; NONEON-NOSVE-NEXT: add sp, sp, #224 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = sext <32 x i8> %b to <32 x i64> @@ -341,6 +606,17 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i16_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i16> %a to <8 x i32> store <8 x i32>%b, ptr %out ret void @@ -361,6 +637,24 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i16_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a %c = sext <16 x i16> %b to <16 x i32> @@ -382,6 +676,18 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v4i16_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <4 x i16> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -403,6 +709,25 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i16_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = sext <8 x i16> %a to <8 x i64> store <8 x i64>%b, ptr %out ret void @@ -437,6 +762,40 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v16i16_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: sshll v0.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: sshll v1.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a %c = sext <16 x i16> %b to <16 x i64> @@ -457,6 +816,17 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v4i32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = sext <4 x i32> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -477,6 +847,24 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sext_v8i32_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = add <8 x i32> %a, %a %c = sext <8 x i32> %b to <8 x i64> @@ -497,6 +885,17 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i8_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i16> store <16 x i16>%b, ptr %out ret void @@ -518,6 +917,24 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v32i8_v32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = zext <32 x i8> %b to <32 x i16> @@ -539,6 +956,18 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i8_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <8 x i8> %a to <8 x i32> store <8 x i32>%b, ptr %out ret void @@ -560,6 +989,25 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i8_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i32> store <16 x i32> %b, ptr %out ret void @@ -594,6 +1042,40 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v32i8_v32i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0 +; NONEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: ushll v0.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v6.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: ushll v1.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v7.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = zext <32 x i8> %b to <32 x i32> @@ -619,6 +1101,20 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v4i8_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <4 x i8> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -641,6 +1137,26 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i8_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = zext <8 x i8> %a to <8 x i64> store <8 x i64>%b, ptr %out ret void @@ -678,6 +1194,41 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: stp q1, q4, [x0, #32] ; CHECK-NEXT: stp q0, q2, [x0, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i8_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-112]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #48] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] +; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q5, [x0, #64] +; NONEON-NOSVE-NEXT: ushll v1.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q4, [x0] +; NONEON-NOSVE-NEXT: ushll v0.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #96] +; NONEON-NOSVE-NEXT: stp q0, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #112 +; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i64> store <16 x i64> %b, ptr %out ret void @@ -746,6 +1297,73 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q0, q2, [x1, #224] ; CHECK-NEXT: stp q3, q1, [x1, #96] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v32i8_v32i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #224 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ushll v5.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: ushll v6.8h, v1.8b, #0 +; 
NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v3.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v4.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: stp q3, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #64] +; NONEON-NOSVE-NEXT: ushll v6.4s, v6.4h, #0 +; NONEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 +; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v7.4s, v7.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q5, [sp, #128] +; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ldr d19, [sp, #152] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #96] +; NONEON-NOSVE-NEXT: ldr d20, [sp, #136] +; NONEON-NOSVE-NEXT: stp q1, q4, [sp, #160] +; NONEON-NOSVE-NEXT: ldr d17, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d21, [sp, #120] +; NONEON-NOSVE-NEXT: stp q7, q6, [sp, #192] +; NONEON-NOSVE-NEXT: ushll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: ushll v19.2d, v19.2s, #0 +; NONEON-NOSVE-NEXT: ldr d16, [sp, #216] +; NONEON-NOSVE-NEXT: ldr d22, [sp, #200] +; NONEON-NOSVE-NEXT: ldr d23, [sp, #184] +; NONEON-NOSVE-NEXT: ldr d18, [sp, #168] +; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v16.2d, v16.2s, #0 +; NONEON-NOSVE-NEXT: stp q5, q19, [x1] +; NONEON-NOSVE-NEXT: ushll v5.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: ushll v7.2d, v22.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: stp q6, q16, [x1, #128] +; NONEON-NOSVE-NEXT: ushll v6.2d, v23.2s, #0 +; NONEON-NOSVE-NEXT: stp q5, q7, [x1, #160] +; NONEON-NOSVE-NEXT: ushll 
v5.2d, v20.2s, #0 +; NONEON-NOSVE-NEXT: stp q4, q6, [x1, #192] +; NONEON-NOSVE-NEXT: ushll v4.2d, v21.2s, #0 +; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #32] +; NONEON-NOSVE-NEXT: ushll v2.2d, v17.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #64] +; NONEON-NOSVE-NEXT: ushll v3.2d, v18.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #96] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #224] +; NONEON-NOSVE-NEXT: add sp, sp, #224 +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a %c = zext <32 x i8> %b to <32 x i64> @@ -766,6 +1384,17 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i16_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <8 x i16> %a to <8 x i32> store <8 x i32>%b, ptr %out ret void @@ -786,6 +1415,24 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i16_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a %c = zext <16 x i16> %b to <16 x i32> @@ -807,6 +1454,18 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v4i16_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <4 x i16> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -828,6 +1487,25 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i16_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %b = zext <8 x i16> %a to <8 x i64> store <8 x i64>%b, ptr %out ret void @@ -862,6 +1540,40 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v16i16_v16i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] +; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: ushll v0.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: ushll v1.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a %c = zext <16 x i16> %b to <16 x i64> @@ -882,6 +1594,17 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v4i32_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %b = zext <4 x i32> %a to <4 x i64> store <4 x i64>%b, ptr %out ret void @@ -902,6 +1625,24 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zext_v8i32_v8i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = add <8 x i32> %a, %a %c = zext <8 x i32> %b to <8 x i64> @@ -928,6 +1669,21 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) { ; SVE2-NEXT: mul z0.d, z1.d, z0.d ; SVE2-NEXT: str q0, [x1] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: extend_and_mul: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v1.2s, w0 +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: fmov x11, d1 +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %broadcast.splatinsert2 = 
insertelement <2 x i32> poison, i32 %0, i64 0 %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer %4 = zext <2 x i32> %broadcast.splat3 to <2 x i64> @@ -943,6 +1699,13 @@ define void @extend_no_mul(i32 %0, <2 x i64> %1, ptr %2) { ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: extend_no_mul: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret entry: %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0 %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll index 54276bb4ba01d2..ade60b07150ce2 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -22,6 +23,15 @@ define void @add_v32i8(ptr %a) { ; CHECK-NEXT: add z1.b, z1.b, #7 // =0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins 
= insertelement <32 x i8> undef, i8 7, i32 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -38,6 +48,16 @@ define void @add_v16i16(ptr %a) { ; CHECK-NEXT: add z1.h, z1.h, #15 // =0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: add v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -54,6 +74,16 @@ define void @add_v8i32(ptr %a) { ; CHECK-NEXT: add z1.s, z1.s, #31 // =0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -70,6 +100,16 @@ define void @add_v4i64(ptr %a) { ; CHECK-NEXT: add z1.d, z1.d, #63 // =0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: add v1.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: add v0.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = 
shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -90,6 +130,15 @@ define void @and_v32i8(ptr %a) { ; CHECK-NEXT: and z1.b, z1.b, #0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -106,6 +155,16 @@ define void @and_v16i16(ptr %a) { ; CHECK-NEXT: and z1.h, z1.h, #0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -122,6 +181,16 @@ define void @and_v8i32(ptr %a) { ; CHECK-NEXT: and z1.s, z1.s, #0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -138,6 +207,16 @@ define 
void @and_v4i64(ptr %a) { ; CHECK-NEXT: and z1.d, z1.d, #0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -158,6 +237,14 @@ define void @ashr_v32i8(ptr %a) { ; CHECK-NEXT: asr z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -174,6 +261,14 @@ define void @ashr_v16i16(ptr %a) { ; CHECK-NEXT: asr z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v0.8h, v0.8h, #0 +; NONEON-NOSVE-NEXT: cmlt v1.8h, v1.8h, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -190,6 +285,14 @@ define void @ashr_v8i32(ptr %a) { ; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: cmlt v0.4s, v0.4s, #0 +; NONEON-NOSVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -206,6 +309,14 @@ define void @ashr_v4i64(ptr %a) { ; CHECK-NEXT: asr z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v0.2d, v0.2d, #0 +; NONEON-NOSVE-NEXT: cmlt v1.2d, v1.2d, #0 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -229,6 +340,15 @@ define void @icmp_eq_v32i8(ptr %a) { ; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_eq_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmeq v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: cmeq v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -249,6 +369,16 @@ define void @icmp_sge_v16i16(ptr %a) { ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sge_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: cmge v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: cmge v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -269,6 +399,16 @@ define void @icmp_sgt_v8i32(ptr %a) { ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_sgt_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #-8 // =0xfffffff8 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: cmgt v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: cmgt v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 -8, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -289,6 +429,16 @@ define void @icmp_ult_v4i64(ptr %a) { ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: icmp_ult_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmhi v1.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmhi v0.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -310,6 +460,14 @@ define void @lshr_v32i8(ptr %a) { ; CHECK-NEXT: lsr z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: ushr v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x 
i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -326,6 +484,14 @@ define void @lshr_v16i16(ptr %a) { ; CHECK-NEXT: lsr z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v0.8h, v0.8h, #15 +; NONEON-NOSVE-NEXT: ushr v1.8h, v1.8h, #15 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -342,6 +508,14 @@ define void @lshr_v8i32(ptr %a) { ; CHECK-NEXT: lsr z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #31 +; NONEON-NOSVE-NEXT: ushr v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -358,6 +532,14 @@ define void @lshr_v4i64(ptr %a) { ; CHECK-NEXT: lsr z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ushr v0.2d, v0.2d, #63 +; NONEON-NOSVE-NEXT: ushr v1.2d, v1.2d, #63 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -378,6 +560,15 @@ define void @mul_v32i8(ptr %a) { ; CHECK-NEXT: mul z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v32i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: mul v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: mul v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -394,6 +585,16 @@ define void @mul_v16i16(ptr %a) { ; CHECK-NEXT: mul z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: mul v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: mul v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -410,6 +611,16 @@ define void @mul_v8i32(ptr %a) { ; CHECK-NEXT: mul z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: mul v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: mul v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -426,6 +637,28 @@ define void @mul_v4i64(ptr %a) { ; CHECK-NEXT: mul z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: mul_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: fmov x11, d1 +; 
NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: lsl x12, x10, #6 +; NONEON-NOSVE-NEXT: lsl x13, x11, #6 +; NONEON-NOSVE-NEXT: lsl x14, x8, #6 +; NONEON-NOSVE-NEXT: sub x10, x12, x10 +; NONEON-NOSVE-NEXT: sub x11, x13, x11 +; NONEON-NOSVE-NEXT: lsl x12, x9, #6 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: fmov d1, x11 +; NONEON-NOSVE-NEXT: sub x8, x14, x8 +; NONEON-NOSVE-NEXT: sub x9, x12, x9 +; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: mov v1.d[1], x9 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -446,6 +679,15 @@ define void @or_v32i8(ptr %a) { ; CHECK-NEXT: orr z1.b, z1.b, #0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -462,6 +704,16 @@ define void @or_v16i16(ptr %a) { ; CHECK-NEXT: orr z1.h, z1.h, #0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> 
zeroinitializer @@ -478,6 +730,16 @@ define void @or_v8i32(ptr %a) { ; CHECK-NEXT: orr z1.s, z1.s, #0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -494,6 +756,16 @@ define void @or_v4i64(ptr %a) { ; CHECK-NEXT: orr z1.d, z1.d, #0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -514,6 +786,14 @@ define void @shl_v32i8(ptr %a) { ; CHECK-NEXT: lsl z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -530,6 +810,14 @@ define void @shl_v16i16(ptr %a) { ; CHECK-NEXT: lsl z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; 
+; NONEON-NOSVE-LABEL: shl_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: shl v0.8h, v0.8h, #15 +; NONEON-NOSVE-NEXT: shl v1.8h, v1.8h, #15 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -546,6 +834,14 @@ define void @shl_v8i32(ptr %a) { ; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: shl v0.4s, v0.4s, #31 +; NONEON-NOSVE-NEXT: shl v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -562,6 +858,14 @@ define void @shl_v4i64(ptr %a) { ; CHECK-NEXT: lsl z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #63 +; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #63 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -582,6 +886,15 @@ define void @smax_v32i8(ptr %a) { ; CHECK-NEXT: smax z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smax v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smax v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, 
ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -598,6 +911,16 @@ define void @smax_v16i16(ptr %a) { ; CHECK-NEXT: smax z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: smax v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smax v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -614,6 +937,16 @@ define void @smax_v8i32(ptr %a) { ; CHECK-NEXT: smax z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: smax v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smax v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -630,6 +963,18 @@ define void @smax_v4i64(ptr %a) { ; CHECK-NEXT: smax z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmgt v3.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmgt v4.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -650,6 +995,15 @@ define void @smin_v32i8(ptr %a) { ; CHECK-NEXT: smin z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smin v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smin v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -666,6 +1020,16 @@ define void @smin_v16i16(ptr %a) { ; CHECK-NEXT: smin z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: smin v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smin v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -682,6 +1046,16 @@ define void @smin_v8i32(ptr %a) { ; CHECK-NEXT: smin z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: smin v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smin v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, 
i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -698,6 +1072,18 @@ define void @smin_v4i64(ptr %a) { ; CHECK-NEXT: smin z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmgt v3.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmgt v4.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -718,6 +1104,15 @@ define void @sub_v32i8(ptr %a) { ; CHECK-NEXT: sub z1.b, z1.b, #7 // =0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: sub v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -734,6 +1129,16 @@ define void @sub_v16i16(ptr %a) { ; CHECK-NEXT: sub z1.h, z1.h, #15 // =0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: sub v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 
15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -750,6 +1155,16 @@ define void @sub_v8i32(ptr %a) { ; CHECK-NEXT: sub z1.s, z1.s, #31 // =0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: sub v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -766,6 +1181,16 @@ define void @sub_v4i64(ptr %a) { ; CHECK-NEXT: sub z1.d, z1.d, #63 // =0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sub_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: sub v1.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: sub v0.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -786,6 +1211,15 @@ define void @umax_v32i8(ptr %a) { ; CHECK-NEXT: umax z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umax v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umax v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ 
-802,6 +1236,16 @@ define void @umax_v16i16(ptr %a) { ; CHECK-NEXT: umax z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: umax v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umax v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -818,6 +1262,16 @@ define void @umax_v8i32(ptr %a) { ; CHECK-NEXT: umax z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: umax v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umax v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -834,6 +1288,18 @@ define void @umax_v4i64(ptr %a) { ; CHECK-NEXT: umax z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmhi v3.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmhi v4.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> 
undef, <4 x i32> zeroinitializer @@ -854,6 +1320,15 @@ define void @umin_v32i8(ptr %a) { ; CHECK-NEXT: umin z1.b, z1.b, #7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umin v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umin v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -870,6 +1345,16 @@ define void @umin_v16i16(ptr %a) { ; CHECK-NEXT: umin z1.h, z1.h, #15 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: umin v1.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umin v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -886,6 +1371,16 @@ define void @umin_v8i32(ptr %a) { ; CHECK-NEXT: umin z1.s, z1.s, #31 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: umin v1.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umin v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -902,6 +1397,18 @@ define void @umin_v4i64(ptr %a) { ; 
CHECK-NEXT: umin z1.d, z1.d, #63 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: cmhi v3.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmhi v4.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b +; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer @@ -922,6 +1429,15 @@ define void @xor_v32i8(ptr %a) { ; CHECK-NEXT: eor z1.b, z1.b, #0x7 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer @@ -938,6 +1454,16 @@ define void @xor_v16i16(ptr %a) { ; CHECK-NEXT: eor z1.h, z1.h, #0xf ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer @@ -954,6 +1480,16 @@ define void @xor_v8i32(ptr %a) { ; 
CHECK-NEXT: eor z1.s, z1.s, #0x1f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer @@ -970,6 +1506,16 @@ define void @xor_v4i64(ptr %a) { ; CHECK-NEXT: eor z1.d, z1.d, #0x3f ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll index 40824ba9ae9c5f..4fc7ec3a8439df 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,11 @@ define <8 x i8> @and_v8i8(<8 
x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = and <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -28,6 +34,11 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = and <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -41,6 +52,15 @@ define void @and_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: and z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = and <32 x i8> %op1, %op2 @@ -56,6 +76,11 @@ define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = and <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -68,6 +93,11 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = and <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ 
-81,6 +111,15 @@ define void @and_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: and z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = and <16 x i16> %op1, %op2 @@ -96,6 +135,11 @@ define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = and <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -108,6 +152,11 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = and <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -121,6 +170,15 @@ define void @and_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: and z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = and <8 x i32> %op1, %op2 @@ -136,6 +194,11 @@ define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = and <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -148,6 +211,11 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = and <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -161,6 +229,15 @@ define void @and_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: and z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: and_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = and <4 x i64> %op1, %op2 @@ -180,6 +257,11 @@ define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = or <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -192,6 +274,11 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = or <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -205,6 +292,15 @@ define void @or_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: orr z1.d, z2.d, z3.d ; 
CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = or <32 x i8> %op1, %op2 @@ -220,6 +316,11 @@ define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = or <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -232,6 +333,11 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = or <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -245,6 +351,15 @@ define void @or_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: orr z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = or <16 x i16> %op1, %op2 @@ -260,6 +375,11 @@ define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v2i32: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = or <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -272,6 +392,11 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = or <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -285,6 +410,15 @@ define void @or_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: orr z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = or <8 x i32> %op1, %op2 @@ -300,6 +434,11 @@ define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = or <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -312,6 +451,11 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = or <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -325,6 +469,15 @@ define void @or_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: orr z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: or_v4i64: +; NONEON-NOSVE: // %bb.0: 
+; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = or <4 x i64> %op1, %op2 @@ -344,6 +497,11 @@ define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = xor <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -356,6 +514,11 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = xor <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -369,6 +532,15 @@ define void @xor_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: eor z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = xor <32 x i8> %op1, %op2 @@ -384,6 +556,11 @@ define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = xor <4 x i16> %op1, %op2 ret <4 x i16> 
%res } @@ -396,6 +573,11 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = xor <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -409,6 +591,15 @@ define void @xor_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: eor z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = xor <16 x i16> %op1, %op2 @@ -424,6 +615,11 @@ define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = xor <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -436,6 +632,11 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = xor <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -449,6 +650,15 @@ define void @xor_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: eor z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; 
NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = xor <8 x i32> %op1, %op2 @@ -464,6 +674,11 @@ define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = xor <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -476,6 +691,11 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = xor <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -489,6 +709,15 @@ define void @xor_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: eor z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: xor_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = xor <4 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll index 74ee5482a60c41..b9c859a58611e8 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,11 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res } @@ -30,6 +36,11 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res } @@ -45,6 +56,15 @@ define void @smax_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: smax z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smax v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smax v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2) @@ -61,6 +81,11 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v4i16: +; NONEON-NOSVE: // %bb.0: 
+; NONEON-NOSVE-NEXT: smax v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res } @@ -74,6 +99,11 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res } @@ -89,6 +119,15 @@ define void @smax_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: smax z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smax v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smax v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2) @@ -105,6 +144,11 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res } @@ -118,6 +162,11 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> 
%op1, <4 x i32> %op2) ret <4 x i32> %res } @@ -133,6 +182,15 @@ define void @smax_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: smax z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2) @@ -150,6 +208,12 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res } @@ -164,6 +228,12 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmgt v2.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res } @@ -179,6 +249,18 @@ define void @smax_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: smax z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smax_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmgt v4.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmgt v5.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: bit v0.16b, 
v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2) @@ -199,6 +281,11 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res } @@ -212,6 +299,11 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res } @@ -227,6 +319,15 @@ define void @smin_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: smin z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smin v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smin v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2) @@ -243,6 +344,11 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v4i16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: smin v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res } @@ -256,6 +362,11 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res } @@ -271,6 +382,15 @@ define void @smin_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: smin z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smin v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smin v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2) @@ -287,6 +407,11 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res } @@ -300,6 +425,11 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.smin.v4i32(<4 x 
i32> %op1, <4 x i32> %op2) ret <4 x i32> %res } @@ -315,6 +445,15 @@ define void @smin_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: smin z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2) @@ -332,6 +471,12 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res } @@ -346,6 +491,12 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res } @@ -361,6 +512,18 @@ define void @smin_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: smin z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smin_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmgt v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmgt v5.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: bit v0.16b, 
v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2) @@ -381,6 +544,11 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res } @@ -394,6 +562,11 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res } @@ -409,6 +582,15 @@ define void @umax_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: umax z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umax v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umax v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2) @@ -425,6 +607,11 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v4i16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: umax v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res } @@ -438,6 +625,11 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res } @@ -453,6 +645,15 @@ define void @umax_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: umax z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umax v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umax v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2) @@ -469,6 +670,11 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res } @@ -482,6 +688,11 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.umax.v4i32(<4 x 
i32> %op1, <4 x i32> %op2) ret <4 x i32> %res } @@ -497,6 +708,15 @@ define void @umax_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2) @@ -514,6 +734,12 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res } @@ -528,6 +754,12 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmhi v2.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res } @@ -543,6 +775,18 @@ define void @umax_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: umax z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umax_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmhi v4.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: cmhi v5.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: bit v0.16b, 
v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2) @@ -563,6 +807,11 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res } @@ -576,6 +825,11 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res } @@ -591,6 +845,15 @@ define void @umin_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: umin z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umin v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umin v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2) @@ -607,6 +870,11 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v4i16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: umin v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res } @@ -620,6 +888,11 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res } @@ -635,6 +908,15 @@ define void @umin_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: umin z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umin v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umin v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2) @@ -651,6 +933,11 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res } @@ -664,6 +951,11 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.umin.v4i32(<4 x 
i32> %op1, <4 x i32> %op2) ret <4 x i32> %res } @@ -679,6 +971,15 @@ define void @umin_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: umin z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2) @@ -696,6 +997,12 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res } @@ -710,6 +1017,12 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res } @@ -725,6 +1038,18 @@ define void @umin_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: umin z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umin_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: cmhi v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmhi v5.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: bit 
v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll index 3ff6983210a0a3..3a03de3442d581 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -20,6 +21,12 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { ; NO-FA64-NEXT: mad z0.b, p0/m, z1.b, z2.b ; NO-FA64-NEXT: // kill: def $d0 killed $d0 killed $z0 ; NO-FA64-NEXT: ret +; +; NONEON-NOSVE-LABEL: mla8xi8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mla v2.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %tmp1 = mul <8 x i8> %A, %B; %tmp2 = add <8 x i8> %C, %tmp1; ret <8 x i8> %tmp2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll index 8917f43002daf9..1ed3d8fa39d8da 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -2,6 +2,7 @@ ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s 
--check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE ; This test only tests the legal types for a given vector width, as mulh nodes ; do not get generated for non-legal types. @@ -36,6 +37,16 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE2-NEXT: lsr z0.h, z0.h, #4 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #4 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i16> undef, i16 4, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer %1 = sext <4 x i8> %op1 to <4 x i16> @@ -63,6 +74,12 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; SVE2-NEXT: smulh z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: shrn v0.8b, v0.8h, #8 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i16> undef, i16 8, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer %1 = sext <8 x i8> %op1 to <8 x i16> @@ -90,6 +107,13 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; SVE2-NEXT: smulh z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull2 v2.8h, v0.16b, v1.16b +; 
NONEON-NOSVE-NEXT: smull v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %1 = sext <16 x i8> %op1 to <16 x i16> %2 = sext <16 x i8> %op2 to <16 x i16> %mul = mul <16 x i16> %1, %2 @@ -118,6 +142,19 @@ define void @smulh_v32i8(ptr %a, ptr %b) { ; SVE2-NEXT: smulh z1.b, z2.b, z3.b ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smull2 v4.8h, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smull v0.8h, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: smull2 v1.8h, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: smull v2.8h, v2.8b, v3.8b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v4.16b +; NONEON-NOSVE-NEXT: uzp2 v1.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %1 = sext <32 x i8> %op1 to <32 x i16> @@ -153,6 +190,16 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE2-NEXT: lsr z0.s, z0.s, #16 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i16> %op1 to <2 x i32> %2 = sext <2 x i16> %op2 to <2 x i32> %mul = mul <2 x i32> %1, %2 @@ -178,6 +225,12 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; SVE2-NEXT: smulh z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull v0.4s, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: shrn v0.4h, v0.4s, #16 +; 
NONEON-NOSVE-NEXT: ret %1 = sext <4 x i16> %op1 to <4 x i32> %2 = sext <4 x i16> %op2 to <4 x i32> %mul = mul <4 x i32> %1, %2 @@ -203,6 +256,13 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; SVE2-NEXT: smulh z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull2 v2.4s, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: smull v0.4s, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: ret %1 = sext <8 x i16> %op1 to <8 x i32> %2 = sext <8 x i16> %op2 to <8 x i32> %mul = mul <8 x i32> %1, %2 @@ -231,6 +291,19 @@ define void @smulh_v16i16(ptr %a, ptr %b) { ; SVE2-NEXT: smulh z1.h, z2.h, z3.h ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smull2 v4.4s, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smull v0.4s, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: smull2 v1.4s, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: smull v2.4s, v2.4h, v3.4h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %1 = sext <16 x i16> %op1 to <16 x i32> @@ -259,6 +332,12 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; SVE2-NEXT: smulh z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull v0.2d, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: shrn v0.2s, v0.2d, #32 +; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i32> %op1 to <2 x i64> %2 = sext <2 x i32> %op2 to <2 x i64> %mul = mul <2 x i64> %1, %2 @@ -284,6 +363,13 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; SVE2-NEXT: smulh z0.s, z0.s, 
z1.s ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smull2 v2.2d, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: smull v0.2d, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: ret %1 = sext <4 x i32> %op1 to <4 x i64> %2 = sext <4 x i32> %op2 to <4 x i64> %mul = mul <4 x i64> %1, %2 @@ -312,6 +398,19 @@ define void @smulh_v8i32(ptr %a, ptr %b) { ; SVE2-NEXT: smulh z1.s, z2.s, z3.s ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: smull2 v4.2d, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smull v0.2d, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: smull2 v1.2d, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: smull v2.2d, v2.2s, v3.2s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp2 v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %1 = sext <8 x i32> %op1 to <8 x i64> @@ -340,6 +439,16 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; SVE2-NEXT: smulh z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: fmov x9, d1 +; NONEON-NOSVE-NEXT: smulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x i128> undef, i128 64, i128 0 %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer %1 = sext <1 x i64> %op1 to <1 x i128> @@ -367,6 +476,19 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; SVE2-NEXT: smulh z0.d, z0.d, z1.d 
; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: fmov x11, d1 +; NONEON-NOSVE-NEXT: smulh x10, x10, x11 +; NONEON-NOSVE-NEXT: smulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i64> %op1 to <2 x i128> %2 = sext <2 x i64> %op2 to <2 x i128> %mul = mul <2 x i128> %1, %2 @@ -395,6 +517,31 @@ define void @smulh_v4i64(ptr %a, ptr %b) { ; SVE2-NEXT: smulh z1.d, z2.d, z3.d ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: smulh_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x11, v0.d[1] +; NONEON-NOSVE-NEXT: mov x14, v3.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x10, v1.d[1] +; NONEON-NOSVE-NEXT: mov x13, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x12, d3 +; NONEON-NOSVE-NEXT: smulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov x9, d2 +; NONEON-NOSVE-NEXT: smulh x10, x10, x11 +; NONEON-NOSVE-NEXT: smulh x9, x9, x12 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: smulh x11, x13, x14 +; NONEON-NOSVE-NEXT: fmov d1, x10 +; NONEON-NOSVE-NEXT: fmov d2, x9 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: fmov d3, x11 +; NONEON-NOSVE-NEXT: mov v2.d[1], v3.d[0] +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %1 = sext <4 x i64> %op1 to <4 x i128> @@ -433,6 +580,15 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE2-NEXT: lsr z0.h, z0.h, #4 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v4i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #4 +; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i8> %op1 to <4 x i16> %2 = zext <4 x i8> %op2 to <4 x i16> %mul = mul <4 x i16> %1, %2 @@ -458,6 +614,12 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; SVE2-NEXT: umulh z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: shrn v0.8b, v0.8h, #8 +; NONEON-NOSVE-NEXT: ret %1 = zext <8 x i8> %op1 to <8 x i16> %2 = zext <8 x i8> %op2 to <8 x i16> %mul = mul <8 x i16> %1, %2 @@ -483,6 +645,13 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; SVE2-NEXT: umulh z0.b, z0.b, z1.b ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull2 v2.8h, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: umull v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %1 = zext <16 x i8> %op1 to <16 x i16> %2 = zext <16 x i8> %op2 to <16 x i16> %mul = mul <16 x i16> %1, %2 @@ -511,6 +680,19 @@ define void @umulh_v32i8(ptr %a, ptr %b) { ; SVE2-NEXT: umulh z1.b, z2.b, z3.b ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umull2 v4.8h, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umull v0.8h, v1.8b, v0.8b +; NONEON-NOSVE-NEXT: umull2 v1.8h, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: umull v2.8h, v2.8b, v3.8b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v4.16b +; NONEON-NOSVE-NEXT: uzp2 v1.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %1 = zext <32 x i8> %op1 to <32 x i16> @@ -545,6 +727,15 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE2-NEXT: lsr z0.s, z0.s, #16 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i16> %op1 to <2 x i32> %2 = zext <2 x i16> %op2 to <2 x i32> %mul = mul <2 x i32> %1, %2 @@ -570,6 +761,12 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; SVE2-NEXT: umulh z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull v0.4s, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: shrn v0.4h, v0.4s, #16 +; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i16> %op1 to <4 x i32> %2 = zext <4 x i16> %op2 to <4 x i32> %mul = mul <4 x i32> %1, %2 @@ -595,6 +792,13 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; SVE2-NEXT: umulh z0.h, z0.h, z1.h ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull2 v2.4s, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: umull v0.4s, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: ret %1 = zext <8 x i16> %op1 to <8 x i32> %2 = zext <8 x i16> %op2 to <8 x i32> %mul = mul <8 x i32> %1, %2 @@ -623,6 +827,19 @@ define void @umulh_v16i16(ptr %a, ptr %b) { ; SVE2-NEXT: umulh z1.h, z2.h, z3.h ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] 
+; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umull2 v4.4s, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umull v0.4s, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: umull2 v1.4s, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: umull v2.4s, v2.4h, v3.4h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %1 = zext <16 x i16> %op1 to <16 x i32> @@ -651,6 +868,12 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; SVE2-NEXT: umulh z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull v0.2d, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: shrn v0.2s, v0.2d, #32 +; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i32> %op1 to <2 x i64> %2 = zext <2 x i32> %op2 to <2 x i64> %mul = mul <2 x i64> %1, %2 @@ -676,6 +899,13 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; SVE2-NEXT: umulh z0.s, z0.s, z1.s ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umull2 v2.2d, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: umull v0.2d, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i32> %op1 to <4 x i64> %2 = zext <4 x i32> %op2 to <4 x i64> %mul = mul <4 x i64> %1, %2 @@ -704,6 +934,19 @@ define void @umulh_v8i32(ptr %a, ptr %b) { ; SVE2-NEXT: umulh z1.s, z2.s, z3.s ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: umull2 v4.2d, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umull v0.2d, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: umull2 v1.2d, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: umull v2.2d, v2.2s, v3.2s +; 
NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp2 v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %insert = insertelement <8 x i64> undef, i64 32, i64 0 @@ -734,6 +977,16 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; SVE2-NEXT: umulh z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: fmov x9, d1 +; NONEON-NOSVE-NEXT: umulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %1 = zext <1 x i64> %op1 to <1 x i128> %2 = zext <1 x i64> %op2 to <1 x i128> %mul = mul <1 x i128> %1, %2 @@ -759,6 +1012,19 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; SVE2-NEXT: umulh z0.d, z0.d, z1.d ; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: mov x9, v1.d[1] +; NONEON-NOSVE-NEXT: fmov x10, d0 +; NONEON-NOSVE-NEXT: fmov x11, d1 +; NONEON-NOSVE-NEXT: umulh x10, x10, x11 +; NONEON-NOSVE-NEXT: umulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i64> %op1 to <2 x i128> %2 = zext <2 x i64> %op2 to <2 x i128> %mul = mul <2 x i128> %1, %2 @@ -787,6 +1053,31 @@ define void @umulh_v4i64(ptr %a, ptr %b) { ; SVE2-NEXT: umulh z1.d, z2.d, z3.d ; SVE2-NEXT: stp q0, q1, [x0] ; SVE2-NEXT: ret +; +; NONEON-NOSVE-LABEL: umulh_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov 
x11, v0.d[1] +; NONEON-NOSVE-NEXT: mov x14, v3.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: mov x10, v1.d[1] +; NONEON-NOSVE-NEXT: mov x13, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x12, d3 +; NONEON-NOSVE-NEXT: umulh x8, x8, x9 +; NONEON-NOSVE-NEXT: fmov x9, d2 +; NONEON-NOSVE-NEXT: umulh x10, x10, x11 +; NONEON-NOSVE-NEXT: umulh x9, x9, x12 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: umulh x11, x13, x14 +; NONEON-NOSVE-NEXT: fmov d1, x10 +; NONEON-NOSVE-NEXT: fmov d2, x9 +; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: fmov d3, x11 +; NONEON-NOSVE-NEXT: mov v2.d[1], v3.d[0] +; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %1 = zext <4 x i64> %op1 to <4 x i128> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll index 1123907f338993..ad75ba62e17cf8 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,12 @@ define i8 @uaddv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) ret i8 %res } @@ -30,6 +37,12 @@ define i8 @uaddv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // 
kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) ret i8 %res } @@ -44,6 +57,14 @@ define i8 @uaddv_v32i8(ptr %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: addv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op) ret i8 %res @@ -58,6 +79,12 @@ define i16 @uaddv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) ret i16 %res } @@ -71,6 +98,12 @@ define i16 @uaddv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a) ret i16 %res } @@ -85,6 +118,14 @@ define i16 @uaddv_v16i16(ptr %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x 
i16>, ptr %a %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op) ret i16 %res @@ -99,6 +140,12 @@ define i32 @uaddv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) ret i32 %res } @@ -112,6 +159,12 @@ define i32 @uaddv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) ret i32 %res } @@ -126,6 +179,14 @@ define i32 @uaddv_v8i32(ptr %a) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: addv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op) ret i32 %res @@ -139,6 +200,12 @@ define i64 @uaddv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: addp d0, v0.2d +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) ret i64 %res } @@ -152,6 +219,14 @@ define i64 @uaddv_v4i64(ptr %a) { ; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uaddv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, 
q0, [x0] +; NONEON-NOSVE-NEXT: add v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: addp d0, v0.2d +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op) ret i64 %res @@ -169,6 +244,12 @@ define i8 @smaxv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a) ret i8 %res } @@ -181,6 +262,12 @@ define i8 @smaxv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a) ret i8 %res } @@ -194,6 +281,14 @@ define i8 @smaxv_v32i8(ptr %a) { ; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smax v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: smaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op) ret i8 %res @@ -207,6 +302,12 @@ define i16 @smaxv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a) ret i16 %res } @@ -219,6 +320,12 @@ define i16 @smaxv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: 
ret +; +; NONEON-NOSVE-LABEL: smaxv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a) ret i16 %res } @@ -232,6 +339,14 @@ define i16 @smaxv_v16i16(ptr %a) { ; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smax v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: smaxv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op) ret i16 %res @@ -245,6 +360,12 @@ define i32 @smaxv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a) ret i32 %res } @@ -257,6 +378,12 @@ define i32 @smaxv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smaxv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) ret i32 %res } @@ -270,6 +397,14 @@ define i32 @smaxv_v8i32(ptr %a) { ; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: smaxv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op) ret i32 %res @@ -284,6 
+419,17 @@ define i64 @smaxv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: smaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a) ret i64 %res } @@ -297,6 +443,20 @@ define i64 @smaxv_v4i64(ptr %a) { ; CHECK-NEXT: smaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: smaxv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op) ret i64 %res @@ -314,6 +474,12 @@ define i8 @sminv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a) ret i8 %res } @@ -326,6 +492,12 @@ define i8 @sminv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res 
= call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a) ret i8 %res } @@ -339,6 +511,14 @@ define i8 @sminv_v32i8(ptr %a) { ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smin v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: sminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op) ret i8 %res @@ -352,6 +532,12 @@ define i16 @sminv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a) ret i16 %res } @@ -364,6 +550,12 @@ define i16 @sminv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a) ret i16 %res } @@ -377,6 +569,14 @@ define i16 @sminv_v16i16(ptr %a) { ; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smin v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: sminv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op) ret i16 %res @@ -390,6 +590,12 @@ define i32 @sminv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v2i32: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: sminp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a) ret i32 %res } @@ -402,6 +608,12 @@ define i32 @sminv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sminv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) ret i32 %res } @@ -415,6 +627,14 @@ define i32 @sminv_v8i32(ptr %a) { ; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: smin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: sminv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op) ret i32 %res @@ -429,6 +649,17 @@ define i64 @sminv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: sminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a) ret i64 %res } @@ -442,6 +673,20 @@ define i64 @sminv_v4i64(ptr %a) { ; CHECK-NEXT: sminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sminv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op) ret i64 %res @@ -459,6 +704,12 @@ define i8 @umaxv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a) ret i8 %res } @@ -471,6 +722,12 @@ define i8 @umaxv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a) ret i8 %res } @@ -484,6 +741,14 @@ define i8 @umaxv_v32i8(ptr %a) { ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
umaxv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umax v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: umaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op) ret i8 %res @@ -497,6 +762,12 @@ define i16 @umaxv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a) ret i16 %res } @@ -509,6 +780,12 @@ define i16 @umaxv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a) ret i16 %res } @@ -522,6 +799,14 @@ define i16 @umaxv_v16i16(ptr %a) { ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umax v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: umaxv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op) ret i16 %res @@ -535,6 +820,12 @@ define i32 @umaxv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a) ret i32 %res } @@ -547,6 +838,12 @@ define i32 
@umaxv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umaxv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) ret i32 %res } @@ -560,6 +857,14 @@ define i32 @umaxv_v8i32(ptr %a) { ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umax v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: umaxv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op) ret i32 %res @@ -574,6 +879,17 @@ define i64 @umaxv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: umaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) ret i64 %res } @@ -587,6 +903,20 @@ define i64 @umaxv_v4i64(ptr %a) { ; CHECK-NEXT: umaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: umaxv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op) ret i64 %res @@ -604,6 +934,12 @@ define i8 @uminv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a) ret i8 %res } @@ -616,6 +952,12 @@ define i8 @uminv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a) ret i8 %res } @@ -629,6 +971,14 @@ define i8 @uminv_v32i8(ptr %a) { ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umin v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op) ret i8 %res @@ -642,6 +992,12 @@ define i16 @uminv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a) ret i16 %res } @@ -654,6 +1010,12 
@@ define i16 @uminv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a) ret i16 %res } @@ -667,6 +1029,14 @@ define i16 @uminv_v16i16(ptr %a) { ; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umin v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uminv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op) ret i16 %res @@ -680,6 +1050,12 @@ define i32 @uminv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a) ret i32 %res } @@ -692,6 +1068,12 @@ define i32 @uminv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: uminv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) ret i32 %res } @@ -705,6 +1087,14 @@ define i32 @uminv_v8i32(ptr %a) { ; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: umin v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uminv s0, v0.4s +; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: 
ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op) ret i32 %res @@ -719,6 +1109,17 @@ define i64 @uminv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a) ret i64 %res } @@ -732,6 +1133,20 @@ define i64 @uminv_v4i64(ptr %a) { ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uminv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op) ret i64 %res diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 4ae7586fca1692..99f8aef9f2b22d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -24,6 +25,35 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: smov w11, v1.h[0] +; NONEON-NOSVE-NEXT: smov w12, v0.h[0] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w14, v1.h[2] +; NONEON-NOSVE-NEXT: smov w15, v0.h[2] +; NONEON-NOSVE-NEXT: smov w17, v1.h[3] +; NONEON-NOSVE-NEXT: smov w18, v0.h[3] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; 
NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.h[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = srem <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -53,6 +83,53 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: smov w11, v1.b[0] +; NONEON-NOSVE-NEXT: smov w12, v0.b[0] +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w14, v1.b[2] +; NONEON-NOSVE-NEXT: smov w15, v0.b[2] +; NONEON-NOSVE-NEXT: smov w17, v1.b[3] +; NONEON-NOSVE-NEXT: smov w18, v0.b[3] +; NONEON-NOSVE-NEXT: smov w1, v1.b[4] +; NONEON-NOSVE-NEXT: smov w2, v0.b[4] +; NONEON-NOSVE-NEXT: smov w4, v1.b[5] +; NONEON-NOSVE-NEXT: smov w5, v0.b[5] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.b[7] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[6] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: smov w14, v0.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[2], w8 +; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: mov v2.b[3], w8 +; 
NONEON-NOSVE-NEXT: sdiv w9, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: mov v2.b[4], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.b[6], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = srem <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -102,6 +179,112 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #-80]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: smov w11, v1.b[0] +; NONEON-NOSVE-NEXT: smov w12, v0.b[0] +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w14, v1.b[2] +; NONEON-NOSVE-NEXT: smov w15, v0.b[2] +; NONEON-NOSVE-NEXT: smov w17, v1.b[3] +; NONEON-NOSVE-NEXT: smov w18, v0.b[3] +; NONEON-NOSVE-NEXT: smov w1, v1.b[4] 
+; NONEON-NOSVE-NEXT: smov w2, v0.b[4] +; NONEON-NOSVE-NEXT: smov w4, v1.b[5] +; NONEON-NOSVE-NEXT: smov w5, v0.b[5] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: smov w7, v1.b[6] +; NONEON-NOSVE-NEXT: smov w19, v0.b[6] +; NONEON-NOSVE-NEXT: smov w21, v1.b[7] +; NONEON-NOSVE-NEXT: smov w22, v0.b[7] +; NONEON-NOSVE-NEXT: smov w24, v1.b[8] +; NONEON-NOSVE-NEXT: smov w25, v0.b[8] +; NONEON-NOSVE-NEXT: smov w27, v1.b[9] +; NONEON-NOSVE-NEXT: smov w28, v0.b[9] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.b[11] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: smov w11, v0.b[10] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.b[10] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: smov w14, v0.b[11] +; NONEON-NOSVE-NEXT: smov w16, v1.b[12] +; NONEON-NOSVE-NEXT: mov v2.b[2], w8 +; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: smov w17, v0.b[12] +; NONEON-NOSVE-NEXT: smov w0, v1.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[3], w8 +; NONEON-NOSVE-NEXT: sdiv w6, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: smov w1, v0.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[4], w8 +; NONEON-NOSVE-NEXT: sdiv w20, w19, w7 +; NONEON-NOSVE-NEXT: msub w8, w6, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: sdiv w23, w22, w21 +; NONEON-NOSVE-NEXT: msub w8, w20, w7, w19 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[6], w8 +; NONEON-NOSVE-NEXT: sdiv w26, w25, w24 +; NONEON-NOSVE-NEXT: msub w8, w23, w21, w22 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w28, w27 +; NONEON-NOSVE-NEXT: msub w8, w26, w24, w25 
+; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[8], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w27, w28 +; NONEON-NOSVE-NEXT: mov v2.b[9], w8 +; NONEON-NOSVE-NEXT: sdiv w15, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: smov w10, v1.b[14] +; NONEON-NOSVE-NEXT: smov w11, v0.b[14] +; NONEON-NOSVE-NEXT: mov v2.b[10], w8 +; NONEON-NOSVE-NEXT: sdiv w18, w17, w16 +; NONEON-NOSVE-NEXT: msub w8, w15, w13, w14 +; NONEON-NOSVE-NEXT: smov w13, v1.b[15] +; NONEON-NOSVE-NEXT: smov w14, v0.b[15] +; NONEON-NOSVE-NEXT: mov v2.b[11], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w1, w0 +; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 +; NONEON-NOSVE-NEXT: mov v2.b[12], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w0, w1 +; NONEON-NOSVE-NEXT: mov v2.b[13], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.b[14], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.b[15], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp], #80 // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %res = srem <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -189,6 +372,279 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mls z2.b, p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #320 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #272] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp 
x20, x19, [sp, #304] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 320 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: str x0, [sp, #216] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w8, v1.b[1] +; NONEON-NOSVE-NEXT: smov w9, v0.b[1] +; NONEON-NOSVE-NEXT: smov w4, v3.b[1] +; NONEON-NOSVE-NEXT: smov w1, v2.b[1] +; NONEON-NOSVE-NEXT: smov w7, v3.b[7] +; NONEON-NOSVE-NEXT: smov w5, v2.b[7] +; NONEON-NOSVE-NEXT: smov w6, v3.b[8] +; NONEON-NOSVE-NEXT: smov w3, v2.b[8] +; NONEON-NOSVE-NEXT: smov w22, v3.b[9] +; NONEON-NOSVE-NEXT: smov w20, v2.b[9] +; NONEON-NOSVE-NEXT: smov w13, v3.b[0] +; NONEON-NOSVE-NEXT: smov w17, v3.b[3] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #100] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w8, v1.b[0] +; NONEON-NOSVE-NEXT: str w9, [sp, #108] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w9, v0.b[0] +; NONEON-NOSVE-NEXT: smov w14, v2.b[3] +; NONEON-NOSVE-NEXT: smov w15, v3.b[4] +; NONEON-NOSVE-NEXT: smov w12, v2.b[4] +; NONEON-NOSVE-NEXT: smov w2, v3.b[5] +; NONEON-NOSVE-NEXT: smov w18, v2.b[5] +; NONEON-NOSVE-NEXT: smov w0, v3.b[6] +; NONEON-NOSVE-NEXT: smov w16, v2.b[6] +; NONEON-NOSVE-NEXT: smov w21, v3.b[10] +; NONEON-NOSVE-NEXT: smov w19, v2.b[10] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w30, [sp, 
#36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: str w10, [sp, #116] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[2] +; NONEON-NOSVE-NEXT: smov w9, v0.b[2] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #44] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[3] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #52] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w9, v0.b[3] +; NONEON-NOSVE-NEXT: sdiv w26, w14, w17 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w11, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[4] +; NONEON-NOSVE-NEXT: smov w9, v0.b[4] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #60] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[5] +; NONEON-NOSVE-NEXT: smov w9, v0.b[5] +; NONEON-NOSVE-NEXT: str w8, [sp, #96] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #104] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #68] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[6] +; NONEON-NOSVE-NEXT: smov w9, v0.b[6] +; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #112] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[7] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w9, v0.b[7] +; NONEON-NOSVE-NEXT: sdiv w25, w12, w15 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[8] +; NONEON-NOSVE-NEXT: smov w9, v0.b[8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #140] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[9] +; NONEON-NOSVE-NEXT: smov w9, v0.b[9] +; NONEON-NOSVE-NEXT: 
str w8, [sp, #148] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #156] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w11, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[10] +; NONEON-NOSVE-NEXT: smov w9, v0.b[10] +; NONEON-NOSVE-NEXT: str w10, [sp, #128] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[11] +; NONEON-NOSVE-NEXT: smov w9, v0.b[11] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #212] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[12] +; NONEON-NOSVE-NEXT: smov w9, v0.b[12] +; NONEON-NOSVE-NEXT: str w8, [sp, #172] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #180] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #200] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[13] +; NONEON-NOSVE-NEXT: smov w9, v0.b[13] +; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #164] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w11, v3.b[2] +; NONEON-NOSVE-NEXT: str w9, [sp, #176] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #188] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.b[14] +; NONEON-NOSVE-NEXT: smov w9, v0.b[14] +; NONEON-NOSVE-NEXT: str w8, [sp, #144] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #152] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #184] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w9, v2.b[2] +; NONEON-NOSVE-NEXT: sdiv w8, w1, w4 +; NONEON-NOSVE-NEXT: str w10, [sp, #160] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w10, v2.b[0] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w8, w5, w7 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: 
sdiv w8, w3, w6 +; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w8, w20, w22 +; NONEON-NOSVE-NEXT: sdiv w24, w10, w13 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w29, w8, [sp, #40] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w30, w29 +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s4, w8 +; NONEON-NOSVE-NEXT: sdiv w23, w9, w11 +; NONEON-NOSVE-NEXT: msub w10, w24, w13, w10 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w24, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w13, w13, w4, w1 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w4, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s5, w10 +; NONEON-NOSVE-NEXT: msub w1, w1, w24, w4 +; NONEON-NOSVE-NEXT: mov v5.b[1], w13 +; NONEON-NOSVE-NEXT: mov v4.b[1], w1 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w23, w11, w9 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w28, w18, w2 +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #52] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #272] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w26, w17, w14 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w11, w10 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: smov w10, v3.b[11] +; NONEON-NOSVE-NEXT: smov w11, v2.b[11] +; NONEON-NOSVE-NEXT: mov v4.b[2], w9 +; NONEON-NOSVE-NEXT: mov v5.b[3], w8 +; NONEON-NOSVE-NEXT: msub w8, w25, w15, w12 +; NONEON-NOSVE-NEXT: ldp w13, w9, [sp, #76] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w27, w16, w0 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: 
ldp x26, x25, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w14, w13 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[4], w8 +; NONEON-NOSVE-NEXT: msub w8, w28, w2, w18 +; NONEON-NOSVE-NEXT: ldr w2, [sp, #156] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[3], w9 +; NONEON-NOSVE-NEXT: ldp w12, w9, [sp, #64] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[5], w8 +; NONEON-NOSVE-NEXT: msub w8, w27, w0, w16 +; NONEON-NOSVE-NEXT: ldr w0, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w4, w19, w21 +; NONEON-NOSVE-NEXT: msub w9, w9, w14, w12 +; NONEON-NOSVE-NEXT: smov w12, v3.b[12] +; NONEON-NOSVE-NEXT: smov w14, v2.b[12] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[6], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[4], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w7, w5 +; NONEON-NOSVE-NEXT: ldr w5, [sp, #204] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w17, w15 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[7], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w13, w11, w10 +; NONEON-NOSVE-NEXT: mov v4.b[5], w9 +; NONEON-NOSVE-NEXT: ldp w16, w9, [sp, #88] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w6, w3 +; NONEON-NOSVE-NEXT: ldr w3, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w17, w16 +; NONEON-NOSVE-NEXT: smov w16, v3.b[13] +; NONEON-NOSVE-NEXT: smov w17, v2.b[13] +; NONEON-NOSVE-NEXT: mov v5.b[8], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[6], w9 +; NONEON-NOSVE-NEXT: msub w8, w8, w22, w20 +; NONEON-NOSVE-NEXT: sdiv w15, w14, w12 +; NONEON-NOSVE-NEXT: ldp w18, w9, [sp, #136] // 8-byte Folded Reload +; 
NONEON-NOSVE-NEXT: mov v5.b[9], w8 +; NONEON-NOSVE-NEXT: msub w8, w4, w21, w19 +; NONEON-NOSVE-NEXT: msub w9, w9, w0, w18 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #304] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[7], w9 +; NONEON-NOSVE-NEXT: mov v5.b[10], w8 +; NONEON-NOSVE-NEXT: msub w8, w13, w10, w11 +; NONEON-NOSVE-NEXT: ldp w0, w9, [sp, #124] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #196] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w13, [sp, #192] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w18, w17, w16 +; NONEON-NOSVE-NEXT: msub w9, w9, w1, w0 +; NONEON-NOSVE-NEXT: mov v5.b[11], w8 +; NONEON-NOSVE-NEXT: smov w0, v3.b[14] +; NONEON-NOSVE-NEXT: msub w10, w10, w13, w11 +; NONEON-NOSVE-NEXT: smov w1, v2.b[14] +; NONEON-NOSVE-NEXT: msub w8, w15, w12, w14 +; NONEON-NOSVE-NEXT: mov v4.b[8], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w15, w13, [sp, #168] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w3, w2 +; NONEON-NOSVE-NEXT: mov v5.b[12], w8 +; NONEON-NOSVE-NEXT: ldp w4, w3, [sp, #208] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w14, w12, [sp, #176] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[9], w9 +; NONEON-NOSVE-NEXT: sdiv w2, w1, w0 +; NONEON-NOSVE-NEXT: smov w9, v3.b[15] +; NONEON-NOSVE-NEXT: msub w3, w3, w5, w4 +; NONEON-NOSVE-NEXT: smov w4, v2.b[15] +; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[10], w3 +; NONEON-NOSVE-NEXT: mov v5.b[13], w8 +; NONEON-NOSVE-NEXT: mov v4.b[11], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #188] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w11, w4, w9 +; NONEON-NOSVE-NEXT: msub w8, w2, w0, w1 +; NONEON-NOSVE-NEXT: msub w10, w10, w13, w12 +; NONEON-NOSVE-NEXT: smov w12, v1.b[15] +; NONEON-NOSVE-NEXT: smov w13, v0.b[15] +; 
NONEON-NOSVE-NEXT: mov v5.b[14], w8 +; NONEON-NOSVE-NEXT: mov v4.b[12], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #184] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w10, w10, w15, w14 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #152] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w14, w13, w12 +; NONEON-NOSVE-NEXT: msub w8, w11, w9, w4 +; NONEON-NOSVE-NEXT: mov v4.b[13], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #160] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[15], w8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #216] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w10, w10, w16, w15 +; NONEON-NOSVE-NEXT: mov v4.b[14], w10 +; NONEON-NOSVE-NEXT: msub w9, w14, w12, w13 +; NONEON-NOSVE-NEXT: mov v4.b[15], w9 +; NONEON-NOSVE-NEXT: stp q5, q4, [x8] +; NONEON-NOSVE-NEXT: add sp, sp, #320 +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = srem <32 x i8> %op1, %op2 @@ -210,6 +666,33 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: smov w11, v1.h[0] +; NONEON-NOSVE-NEXT: smov w12, v0.h[0] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w14, v1.h[2] +; NONEON-NOSVE-NEXT: smov w15, v0.h[2] +; NONEON-NOSVE-NEXT: smov w17, v1.h[3] +; NONEON-NOSVE-NEXT: smov w18, v0.h[3] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: 
mov v0.h[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = srem <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -238,6 +721,51 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: smov w11, v1.h[0] +; NONEON-NOSVE-NEXT: smov w12, v0.h[0] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w14, v1.h[2] +; NONEON-NOSVE-NEXT: smov w15, v0.h[2] +; NONEON-NOSVE-NEXT: smov w17, v1.h[3] +; NONEON-NOSVE-NEXT: smov w18, v0.h[3] +; NONEON-NOSVE-NEXT: smov w1, v1.h[4] +; NONEON-NOSVE-NEXT: smov w2, v0.h[4] +; NONEON-NOSVE-NEXT: smov w4, v1.h[5] +; NONEON-NOSVE-NEXT: smov w5, v0.h[5] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: smov w13, v1.h[7] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: smov w11, v0.h[6] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: smov w10, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: smov w14, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; 
NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = srem <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -282,6 +810,139 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: smov w8, v1.h[1] +; NONEON-NOSVE-NEXT: smov w9, v0.h[1] +; NONEON-NOSVE-NEXT: smov w20, v1.h[0] +; NONEON-NOSVE-NEXT: smov w21, v0.h[0] +; NONEON-NOSVE-NEXT: smov w19, v0.h[3] +; NONEON-NOSVE-NEXT: smov w5, v1.h[4] +; NONEON-NOSVE-NEXT: smov w2, v0.h[4] +; NONEON-NOSVE-NEXT: smov w1, v3.h[1] +; NONEON-NOSVE-NEXT: smov w23, v2.h[1] +; 
NONEON-NOSVE-NEXT: smov w25, v3.h[0] +; NONEON-NOSVE-NEXT: smov w26, v2.h[0] +; NONEON-NOSVE-NEXT: smov w6, v1.h[5] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w8, v1.h[2] +; NONEON-NOSVE-NEXT: smov w9, v0.h[2] +; NONEON-NOSVE-NEXT: smov w3, v0.h[5] +; NONEON-NOSVE-NEXT: smov w4, v1.h[6] +; NONEON-NOSVE-NEXT: smov w7, v0.h[6] +; NONEON-NOSVE-NEXT: smov w28, v3.h[2] +; NONEON-NOSVE-NEXT: smov w29, v2.h[2] +; NONEON-NOSVE-NEXT: smov w15, v3.h[3] +; NONEON-NOSVE-NEXT: smov w13, v2.h[3] +; NONEON-NOSVE-NEXT: smov w12, v3.h[4] +; NONEON-NOSVE-NEXT: smov w14, v3.h[5] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w11, w21, w20 +; NONEON-NOSVE-NEXT: str w10, [sp, #44] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: smov w8, v1.h[3] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w11, v2.h[4] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #4] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w20, w22, w20, w21 +; NONEON-NOSVE-NEXT: sdiv w9, w19, w8 +; NONEON-NOSVE-NEXT: str w10, [sp, #32] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w10, v3.h[6] +; NONEON-NOSVE-NEXT: fmov s5, w20 +; NONEON-NOSVE-NEXT: smov w20, v3.h[7] +; NONEON-NOSVE-NEXT: sdiv w8, w2, w5 +; NONEON-NOSVE-NEXT: sdiv w24, w23, w1 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sdiv w27, w26, w25 +; NONEON-NOSVE-NEXT: msub w1, w24, w1, w23 +; NONEON-NOSVE-NEXT: ldp w24, w23, [sp, #40] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w9, w3, w6 +; NONEON-NOSVE-NEXT: msub w21, w27, w25, w26 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w23, w23, w25, w24 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s4, w21 +; NONEON-NOSVE-NEXT: mov v5.h[1], w23 +; NONEON-NOSVE-NEXT: ldp 
w23, w21, [sp, #28] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[1], w1 +; NONEON-NOSVE-NEXT: sdiv w8, w7, w4 +; NONEON-NOSVE-NEXT: msub w21, w21, w25, w23 +; NONEON-NOSVE-NEXT: smov w23, v2.h[7] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.h[2], w21 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: sdiv w30, w29, w28 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: smov w9, v2.h[5] +; NONEON-NOSVE-NEXT: smov w8, v2.h[6] +; NONEON-NOSVE-NEXT: sdiv w18, w13, w15 +; NONEON-NOSVE-NEXT: msub w1, w30, w28, w29 +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[2], w1 +; NONEON-NOSVE-NEXT: sdiv w16, w11, w12 +; NONEON-NOSVE-NEXT: msub w13, w18, w15, w13 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w18, [sp] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w15, w15, w18, w19 +; NONEON-NOSVE-NEXT: mov v4.h[3], w13 +; NONEON-NOSVE-NEXT: smov w13, v1.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[3], w15 +; NONEON-NOSVE-NEXT: smov w15, v0.h[7] +; NONEON-NOSVE-NEXT: sdiv w17, w9, w14 +; NONEON-NOSVE-NEXT: msub w11, w16, w12, w11 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #16] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w12, w12, w5, w2 +; NONEON-NOSVE-NEXT: mov v4.h[4], w11 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.h[4], w12 +; NONEON-NOSVE-NEXT: msub w11, w11, w6, w3 +; NONEON-NOSVE-NEXT: sdiv w24, w8, w10 +; NONEON-NOSVE-NEXT: msub w9, w17, w14, w9 +; NONEON-NOSVE-NEXT: mov v5.h[5], w11 +; NONEON-NOSVE-NEXT: mov v4.h[5], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w4, w7 +; NONEON-NOSVE-NEXT: sdiv w18, w23, w20 +; NONEON-NOSVE-NEXT: msub w8, w24, w10, w8 +; 
NONEON-NOSVE-NEXT: mov v5.h[6], w9 +; NONEON-NOSVE-NEXT: mov v4.h[6], w8 +; NONEON-NOSVE-NEXT: sdiv w12, w15, w13 +; NONEON-NOSVE-NEXT: msub w8, w18, w20, w23 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[7], w8 +; NONEON-NOSVE-NEXT: msub w9, w12, w13, w15 +; NONEON-NOSVE-NEXT: mov v5.h[7], w9 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #144 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = srem <16 x i16> %op1, %op2 @@ -300,6 +961,23 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w11, v1.s[1] +; NONEON-NOSVE-NEXT: mov w12, v0.s[1] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = srem <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -315,6 +993,30 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov w11, s1 +; NONEON-NOSVE-NEXT: fmov w12, s0 +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: mov w14, v1.s[2] +; NONEON-NOSVE-NEXT: mov 
w15, v0.s[2] +; NONEON-NOSVE-NEXT: mov w17, v1.s[3] +; NONEON-NOSVE-NEXT: mov w18, v0.s[3] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.s[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %res = srem <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -334,6 +1036,65 @@ define void @srem_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str x23, [sp, #-48]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -48 +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov w12, s0 +; NONEON-NOSVE-NEXT: fmov w3, s2 +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w11, s1 +; NONEON-NOSVE-NEXT: fmov w2, s3 +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w17, v3.s[1] +; NONEON-NOSVE-NEXT: mov w18, v2.s[1] +; NONEON-NOSVE-NEXT: mov w14, v1.s[2] +; NONEON-NOSVE-NEXT: mov w15, v0.s[2] +; NONEON-NOSVE-NEXT: mov w5, v3.s[2] +; NONEON-NOSVE-NEXT: mov w6, v2.s[2] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: mov w19, v3.s[3] +; NONEON-NOSVE-NEXT: mov w20, v2.s[3] +; 
NONEON-NOSVE-NEXT: mov w22, v1.s[3] +; NONEON-NOSVE-NEXT: mov w23, v0.s[3] +; NONEON-NOSVE-NEXT: sdiv w4, w3, w2 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s1, w11 +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w12, w4, w2, w3 +; NONEON-NOSVE-NEXT: fmov s0, w12 +; NONEON-NOSVE-NEXT: sdiv w1, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v1.s[1], w8 +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w13, w1, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.s[1], w13 +; NONEON-NOSVE-NEXT: sdiv w7, w6, w5 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v1.s[2], w8 +; NONEON-NOSVE-NEXT: sdiv w21, w20, w19 +; NONEON-NOSVE-NEXT: msub w10, w7, w5, w6 +; NONEON-NOSVE-NEXT: mov v0.s[2], w10 +; NONEON-NOSVE-NEXT: sdiv w9, w23, w22 +; NONEON-NOSVE-NEXT: msub w10, w21, w19, w20 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v0.s[3], w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w22, w23 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = srem <8 x i32> %op1, %op2 @@ -352,6 +1113,17 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = srem <1 x i64> %op1, %op2 ret 
<1 x i64> %res } @@ -367,6 +1139,20 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: sdiv x13, x12, x11 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = srem <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -386,6 +1172,33 @@ define void @srem_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: srem_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: fmov x15, d2 +; NONEON-NOSVE-NEXT: mov x12, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x14, d3 +; NONEON-NOSVE-NEXT: mov x11, v3.d[1] +; NONEON-NOSVE-NEXT: mov x17, v1.d[1] +; NONEON-NOSVE-NEXT: mov x18, v0.d[1] +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: sdiv x16, x15, x14 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: sdiv x13, x12, x11 +; NONEON-NOSVE-NEXT: msub x10, x16, x14, x15 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: sdiv x1, x18, x17 +; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: msub x11, x1, x17, x18 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = srem <4 x i64> %op1, %op2 @@ -413,6 +1226,41 
@@ define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w11, v1.h[0] +; NONEON-NOSVE-NEXT: umov w12, v0.h[0] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w14, v1.h[2] +; NONEON-NOSVE-NEXT: umov w15, v0.h[2] +; NONEON-NOSVE-NEXT: umov w17, v1.h[3] +; NONEON-NOSVE-NEXT: umov w18, v0.h[3] +; NONEON-NOSVE-NEXT: and w11, w11, #0xff +; NONEON-NOSVE-NEXT: and w12, w12, #0xff +; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: and w9, w9, #0xff +; NONEON-NOSVE-NEXT: and w14, w14, #0xff +; NONEON-NOSVE-NEXT: and w15, w15, #0xff +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: and w12, w17, #0xff +; NONEON-NOSVE-NEXT: and w13, w18, #0xff +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w9, w13, w12 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.h[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w12, w13 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = urem <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -442,6 +1290,53 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def 
$q0 +; NONEON-NOSVE-NEXT: umov w11, v1.b[0] +; NONEON-NOSVE-NEXT: umov w12, v0.b[0] +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w14, v1.b[2] +; NONEON-NOSVE-NEXT: umov w15, v0.b[2] +; NONEON-NOSVE-NEXT: umov w17, v1.b[3] +; NONEON-NOSVE-NEXT: umov w18, v0.b[3] +; NONEON-NOSVE-NEXT: umov w1, v1.b[4] +; NONEON-NOSVE-NEXT: umov w2, v0.b[4] +; NONEON-NOSVE-NEXT: umov w4, v1.b[5] +; NONEON-NOSVE-NEXT: umov w5, v0.b[5] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.b[7] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[6] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[6] +; NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: umov w14, v0.b[7] +; NONEON-NOSVE-NEXT: mov v2.b[2], w8 +; NONEON-NOSVE-NEXT: udiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: mov v2.b[3], w8 +; NONEON-NOSVE-NEXT: udiv w9, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: mov v2.b[4], w8 +; NONEON-NOSVE-NEXT: udiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: udiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.b[6], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ret %res = urem <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -491,6 +1386,112 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v16i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: stp x28, x27, [sp, #-80]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: umov w11, v1.b[0] +; NONEON-NOSVE-NEXT: umov w12, v0.b[0] +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w14, v1.b[2] +; NONEON-NOSVE-NEXT: umov w15, v0.b[2] +; NONEON-NOSVE-NEXT: umov w17, v1.b[3] +; NONEON-NOSVE-NEXT: umov w18, v0.b[3] +; NONEON-NOSVE-NEXT: umov w1, v1.b[4] +; NONEON-NOSVE-NEXT: umov w2, v0.b[4] +; NONEON-NOSVE-NEXT: umov w4, v1.b[5] +; NONEON-NOSVE-NEXT: umov w5, v0.b[5] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: umov w7, v1.b[6] +; NONEON-NOSVE-NEXT: umov w19, v0.b[6] +; NONEON-NOSVE-NEXT: umov w21, v1.b[7] +; NONEON-NOSVE-NEXT: umov w22, v0.b[7] +; NONEON-NOSVE-NEXT: umov w24, v1.b[8] +; NONEON-NOSVE-NEXT: umov w25, v0.b[8] +; NONEON-NOSVE-NEXT: umov w27, v1.b[9] +; NONEON-NOSVE-NEXT: umov w28, v0.b[9] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.b[11] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: umov w11, v0.b[10] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.b[10] +; 
NONEON-NOSVE-NEXT: mov v2.b[1], w8 +; NONEON-NOSVE-NEXT: udiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: umov w14, v0.b[11] +; NONEON-NOSVE-NEXT: umov w16, v1.b[12] +; NONEON-NOSVE-NEXT: mov v2.b[2], w8 +; NONEON-NOSVE-NEXT: udiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: umov w17, v0.b[12] +; NONEON-NOSVE-NEXT: umov w0, v1.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[3], w8 +; NONEON-NOSVE-NEXT: udiv w6, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: umov w1, v0.b[13] +; NONEON-NOSVE-NEXT: mov v2.b[4], w8 +; NONEON-NOSVE-NEXT: udiv w20, w19, w7 +; NONEON-NOSVE-NEXT: msub w8, w6, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.b[5], w8 +; NONEON-NOSVE-NEXT: udiv w23, w22, w21 +; NONEON-NOSVE-NEXT: msub w8, w20, w7, w19 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[6], w8 +; NONEON-NOSVE-NEXT: udiv w26, w25, w24 +; NONEON-NOSVE-NEXT: msub w8, w23, w21, w22 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[7], w8 +; NONEON-NOSVE-NEXT: udiv w9, w28, w27 +; NONEON-NOSVE-NEXT: msub w8, w26, w24, w25 +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v2.b[8], w8 +; NONEON-NOSVE-NEXT: udiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w27, w28 +; NONEON-NOSVE-NEXT: mov v2.b[9], w8 +; NONEON-NOSVE-NEXT: udiv w15, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: umov w10, v1.b[14] +; NONEON-NOSVE-NEXT: umov w11, v0.b[14] +; NONEON-NOSVE-NEXT: mov v2.b[10], w8 +; NONEON-NOSVE-NEXT: udiv w18, w17, w16 +; NONEON-NOSVE-NEXT: msub w8, w15, w13, w14 +; NONEON-NOSVE-NEXT: umov w13, v1.b[15] +; NONEON-NOSVE-NEXT: umov w14, v0.b[15] +; NONEON-NOSVE-NEXT: mov v2.b[11], w8 +; NONEON-NOSVE-NEXT: udiv w9, w1, w0 +; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 +; 
NONEON-NOSVE-NEXT: mov v2.b[12], w8 +; NONEON-NOSVE-NEXT: udiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w0, w1 +; NONEON-NOSVE-NEXT: mov v2.b[13], w8 +; NONEON-NOSVE-NEXT: udiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.b[14], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.b[15], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp], #80 // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %res = urem <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -578,6 +1579,279 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mls z2.b, p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #320 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #272] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #304] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 320 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: str x0, [sp, #216] // 
8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w8, v1.b[1] +; NONEON-NOSVE-NEXT: umov w9, v0.b[1] +; NONEON-NOSVE-NEXT: umov w4, v3.b[1] +; NONEON-NOSVE-NEXT: umov w1, v2.b[1] +; NONEON-NOSVE-NEXT: umov w7, v3.b[7] +; NONEON-NOSVE-NEXT: umov w5, v2.b[7] +; NONEON-NOSVE-NEXT: umov w6, v3.b[8] +; NONEON-NOSVE-NEXT: umov w3, v2.b[8] +; NONEON-NOSVE-NEXT: umov w22, v3.b[9] +; NONEON-NOSVE-NEXT: umov w20, v2.b[9] +; NONEON-NOSVE-NEXT: umov w13, v3.b[0] +; NONEON-NOSVE-NEXT: umov w17, v3.b[3] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #100] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w8, v1.b[0] +; NONEON-NOSVE-NEXT: str w9, [sp, #108] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w9, v0.b[0] +; NONEON-NOSVE-NEXT: umov w14, v2.b[3] +; NONEON-NOSVE-NEXT: umov w15, v3.b[4] +; NONEON-NOSVE-NEXT: umov w12, v2.b[4] +; NONEON-NOSVE-NEXT: umov w2, v3.b[5] +; NONEON-NOSVE-NEXT: umov w18, v2.b[5] +; NONEON-NOSVE-NEXT: umov w0, v3.b[6] +; NONEON-NOSVE-NEXT: umov w16, v2.b[6] +; NONEON-NOSVE-NEXT: umov w21, v3.b[10] +; NONEON-NOSVE-NEXT: umov w19, v2.b[10] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w30, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: str w10, [sp, #116] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[2] +; NONEON-NOSVE-NEXT: umov w9, v0.b[2] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #44] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[3] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #52] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w9, v0.b[3] +; NONEON-NOSVE-NEXT: udiv w26, w14, w17 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w11, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[4] +; NONEON-NOSVE-NEXT: umov w9, v0.b[4] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #60] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; 
NONEON-NOSVE-NEXT: umov w8, v1.b[5] +; NONEON-NOSVE-NEXT: umov w9, v0.b[5] +; NONEON-NOSVE-NEXT: str w8, [sp, #96] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #104] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #68] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[6] +; NONEON-NOSVE-NEXT: umov w9, v0.b[6] +; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #112] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[7] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w9, v0.b[7] +; NONEON-NOSVE-NEXT: udiv w25, w12, w15 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[8] +; NONEON-NOSVE-NEXT: umov w9, v0.b[8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #140] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[9] +; NONEON-NOSVE-NEXT: umov w9, v0.b[9] +; NONEON-NOSVE-NEXT: str w8, [sp, #148] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #156] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w11, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[10] +; NONEON-NOSVE-NEXT: umov w9, v0.b[10] +; NONEON-NOSVE-NEXT: str w10, [sp, #128] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[11] +; NONEON-NOSVE-NEXT: umov w9, v0.b[11] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #212] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[12] +; NONEON-NOSVE-NEXT: umov w9, v0.b[12] +; NONEON-NOSVE-NEXT: str w8, [sp, #172] // 4-byte Folded Spill +; 
NONEON-NOSVE-NEXT: str w9, [sp, #180] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #200] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[13] +; NONEON-NOSVE-NEXT: umov w9, v0.b[13] +; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #164] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w11, v3.b[2] +; NONEON-NOSVE-NEXT: str w9, [sp, #176] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #188] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.b[14] +; NONEON-NOSVE-NEXT: umov w9, v0.b[14] +; NONEON-NOSVE-NEXT: str w8, [sp, #144] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w9, [sp, #152] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: str w10, [sp, #184] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w9, v2.b[2] +; NONEON-NOSVE-NEXT: udiv w8, w1, w4 +; NONEON-NOSVE-NEXT: str w10, [sp, #160] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w10, v2.b[0] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w8, w5, w7 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w8, w3, w6 +; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w8, w20, w22 +; NONEON-NOSVE-NEXT: udiv w24, w10, w13 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w29, w8, [sp, #40] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w30, w29 +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s4, w8 +; NONEON-NOSVE-NEXT: udiv w23, w9, w11 +; NONEON-NOSVE-NEXT: msub w10, w24, w13, w10 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w24, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w13, w13, w4, w1 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w4, 
[sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s5, w10 +; NONEON-NOSVE-NEXT: msub w1, w1, w24, w4 +; NONEON-NOSVE-NEXT: mov v5.b[1], w13 +; NONEON-NOSVE-NEXT: mov v4.b[1], w1 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w23, w11, w9 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w28, w18, w2 +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #52] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #272] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w26, w17, w14 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w11, w10 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: umov w10, v3.b[11] +; NONEON-NOSVE-NEXT: umov w11, v2.b[11] +; NONEON-NOSVE-NEXT: mov v4.b[2], w9 +; NONEON-NOSVE-NEXT: mov v5.b[3], w8 +; NONEON-NOSVE-NEXT: msub w8, w25, w15, w12 +; NONEON-NOSVE-NEXT: ldp w13, w9, [sp, #76] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w27, w16, w0 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w14, w13 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[4], w8 +; NONEON-NOSVE-NEXT: msub w8, w28, w2, w18 +; NONEON-NOSVE-NEXT: ldr w2, [sp, #156] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[3], w9 +; NONEON-NOSVE-NEXT: ldp w12, w9, [sp, #64] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[5], w8 +; NONEON-NOSVE-NEXT: msub w8, w27, w0, w16 +; NONEON-NOSVE-NEXT: ldr w0, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w4, w19, w21 +; NONEON-NOSVE-NEXT: msub w9, w9, w14, w12 +; NONEON-NOSVE-NEXT: umov w12, v3.b[12] +; NONEON-NOSVE-NEXT: umov w14, v2.b[12] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #240] // 16-byte Folded Reload 
+; NONEON-NOSVE-NEXT: mov v5.b[6], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[4], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w7, w5 +; NONEON-NOSVE-NEXT: ldr w5, [sp, #204] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w17, w15 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[7], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w13, w11, w10 +; NONEON-NOSVE-NEXT: mov v4.b[5], w9 +; NONEON-NOSVE-NEXT: ldp w16, w9, [sp, #88] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w8, w8, w6, w3 +; NONEON-NOSVE-NEXT: ldr w3, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w17, w16 +; NONEON-NOSVE-NEXT: umov w16, v3.b[13] +; NONEON-NOSVE-NEXT: umov w17, v2.b[13] +; NONEON-NOSVE-NEXT: mov v5.b[8], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[6], w9 +; NONEON-NOSVE-NEXT: msub w8, w8, w22, w20 +; NONEON-NOSVE-NEXT: udiv w15, w14, w12 +; NONEON-NOSVE-NEXT: ldp w18, w9, [sp, #136] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[9], w8 +; NONEON-NOSVE-NEXT: msub w8, w4, w21, w19 +; NONEON-NOSVE-NEXT: msub w9, w9, w0, w18 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #304] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[7], w9 +; NONEON-NOSVE-NEXT: mov v5.b[10], w8 +; NONEON-NOSVE-NEXT: msub w8, w13, w10, w11 +; NONEON-NOSVE-NEXT: ldp w0, w9, [sp, #124] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #196] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w13, [sp, #192] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w18, w17, w16 +; NONEON-NOSVE-NEXT: msub w9, w9, w1, w0 +; NONEON-NOSVE-NEXT: mov v5.b[11], w8 +; NONEON-NOSVE-NEXT: umov w0, v3.b[14] +; NONEON-NOSVE-NEXT: msub w10, w10, w13, 
w11 +; NONEON-NOSVE-NEXT: umov w1, v2.b[14] +; NONEON-NOSVE-NEXT: msub w8, w15, w12, w14 +; NONEON-NOSVE-NEXT: mov v4.b[8], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w15, w13, [sp, #168] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w3, w2 +; NONEON-NOSVE-NEXT: mov v5.b[12], w8 +; NONEON-NOSVE-NEXT: ldp w4, w3, [sp, #208] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w14, w12, [sp, #176] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[9], w9 +; NONEON-NOSVE-NEXT: udiv w2, w1, w0 +; NONEON-NOSVE-NEXT: umov w9, v3.b[15] +; NONEON-NOSVE-NEXT: msub w3, w3, w5, w4 +; NONEON-NOSVE-NEXT: umov w4, v2.b[15] +; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.b[10], w3 +; NONEON-NOSVE-NEXT: mov v5.b[13], w8 +; NONEON-NOSVE-NEXT: mov v4.b[11], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #188] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w11, w4, w9 +; NONEON-NOSVE-NEXT: msub w8, w2, w0, w1 +; NONEON-NOSVE-NEXT: msub w10, w10, w13, w12 +; NONEON-NOSVE-NEXT: umov w12, v1.b[15] +; NONEON-NOSVE-NEXT: umov w13, v0.b[15] +; NONEON-NOSVE-NEXT: mov v5.b[14], w8 +; NONEON-NOSVE-NEXT: mov v4.b[12], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #184] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w10, w10, w15, w14 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #152] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w14, w13, w12 +; NONEON-NOSVE-NEXT: msub w8, w11, w9, w4 +; NONEON-NOSVE-NEXT: mov v4.b[13], w10 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #160] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.b[15], w8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #216] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w10, w10, w16, w15 +; NONEON-NOSVE-NEXT: mov v4.b[14], w10 +; NONEON-NOSVE-NEXT: msub w9, w14, w12, w13 +; NONEON-NOSVE-NEXT: mov v4.b[15], w9 +; NONEON-NOSVE-NEXT: stp q5, q4, [x8] +; NONEON-NOSVE-NEXT: add sp, sp, #320 +; 
NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = urem <32 x i8> %op1, %op2 @@ -599,6 +1873,33 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w11, v1.h[0] +; NONEON-NOSVE-NEXT: umov w12, v0.h[0] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w14, v1.h[2] +; NONEON-NOSVE-NEXT: umov w15, v0.h[2] +; NONEON-NOSVE-NEXT: umov w17, v1.h[3] +; NONEON-NOSVE-NEXT: umov w18, v0.h[3] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.h[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.h[3], w8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = urem <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -627,6 +1928,51 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: umov w11, v1.h[0] +; NONEON-NOSVE-NEXT: umov w12, v0.h[0] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w14, v1.h[2] +; NONEON-NOSVE-NEXT: umov w15, v0.h[2] +; NONEON-NOSVE-NEXT: umov w17, v1.h[3] +; 
NONEON-NOSVE-NEXT: umov w18, v0.h[3] +; NONEON-NOSVE-NEXT: umov w1, v1.h[4] +; NONEON-NOSVE-NEXT: umov w2, v0.h[4] +; NONEON-NOSVE-NEXT: umov w4, v1.h[5] +; NONEON-NOSVE-NEXT: umov w5, v0.h[5] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: umov w13, v1.h[7] +; NONEON-NOSVE-NEXT: fmov s2, w11 +; NONEON-NOSVE-NEXT: umov w11, v0.h[6] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: umov w10, v1.h[6] +; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: udiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: umov w14, v0.h[7] +; NONEON-NOSVE-NEXT: mov v2.h[2], w8 +; NONEON-NOSVE-NEXT: udiv w3, w2, w1 +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: mov v2.h[3], w8 +; NONEON-NOSVE-NEXT: udiv w9, w5, w4 +; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 +; NONEON-NOSVE-NEXT: mov v2.h[4], w8 +; NONEON-NOSVE-NEXT: udiv w12, w11, w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 +; NONEON-NOSVE-NEXT: mov v2.h[5], w8 +; NONEON-NOSVE-NEXT: udiv w9, w14, w13 +; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 +; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 +; NONEON-NOSVE-NEXT: mov v2.h[7], w8 +; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %res = urem <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -671,6 +2017,139 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill +; 
NONEON-NOSVE-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q2, [x0] +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: umov w8, v1.h[1] +; NONEON-NOSVE-NEXT: umov w9, v0.h[1] +; NONEON-NOSVE-NEXT: umov w20, v1.h[0] +; NONEON-NOSVE-NEXT: umov w21, v0.h[0] +; NONEON-NOSVE-NEXT: umov w19, v0.h[3] +; NONEON-NOSVE-NEXT: umov w5, v1.h[4] +; NONEON-NOSVE-NEXT: umov w2, v0.h[4] +; NONEON-NOSVE-NEXT: umov w1, v3.h[1] +; NONEON-NOSVE-NEXT: umov w23, v2.h[1] +; NONEON-NOSVE-NEXT: umov w25, v3.h[0] +; NONEON-NOSVE-NEXT: umov w26, v2.h[0] +; NONEON-NOSVE-NEXT: umov w6, v1.h[5] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w8, v1.h[2] +; NONEON-NOSVE-NEXT: umov w9, v0.h[2] +; NONEON-NOSVE-NEXT: umov w3, v0.h[5] +; NONEON-NOSVE-NEXT: umov w4, v1.h[6] +; NONEON-NOSVE-NEXT: umov w7, v0.h[6] +; NONEON-NOSVE-NEXT: umov w28, v3.h[2] +; NONEON-NOSVE-NEXT: umov w29, v2.h[2] +; NONEON-NOSVE-NEXT: umov w15, v3.h[3] +; NONEON-NOSVE-NEXT: umov w13, v2.h[3] +; NONEON-NOSVE-NEXT: umov w12, v3.h[4] +; NONEON-NOSVE-NEXT: umov w14, v3.h[5] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w11, w21, w20 +; NONEON-NOSVE-NEXT: str 
w10, [sp, #44] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: umov w8, v1.h[3] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w11, v2.h[4] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #4] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w20, w22, w20, w21 +; NONEON-NOSVE-NEXT: udiv w9, w19, w8 +; NONEON-NOSVE-NEXT: str w10, [sp, #32] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w10, v3.h[6] +; NONEON-NOSVE-NEXT: fmov s5, w20 +; NONEON-NOSVE-NEXT: umov w20, v3.h[7] +; NONEON-NOSVE-NEXT: udiv w8, w2, w5 +; NONEON-NOSVE-NEXT: udiv w24, w23, w1 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: udiv w27, w26, w25 +; NONEON-NOSVE-NEXT: msub w1, w24, w1, w23 +; NONEON-NOSVE-NEXT: ldp w24, w23, [sp, #40] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w9, w3, w6 +; NONEON-NOSVE-NEXT: msub w21, w27, w25, w26 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w23, w23, w25, w24 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: fmov s4, w21 +; NONEON-NOSVE-NEXT: mov v5.h[1], w23 +; NONEON-NOSVE-NEXT: ldp w23, w21, [sp, #28] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[1], w1 +; NONEON-NOSVE-NEXT: udiv w8, w7, w4 +; NONEON-NOSVE-NEXT: msub w21, w21, w25, w23 +; NONEON-NOSVE-NEXT: umov w23, v2.h[7] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.h[2], w21 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: udiv w30, w29, w28 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: umov w9, v2.h[5] +; NONEON-NOSVE-NEXT: umov w8, v2.h[6] +; NONEON-NOSVE-NEXT: udiv w18, w13, w15 +; NONEON-NOSVE-NEXT: msub w1, w30, w28, w29 +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #48] // 16-byte 
Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[2], w1 +; NONEON-NOSVE-NEXT: udiv w16, w11, w12 +; NONEON-NOSVE-NEXT: msub w13, w18, w15, w13 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w18, [sp] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w15, w15, w18, w19 +; NONEON-NOSVE-NEXT: mov v4.h[3], w13 +; NONEON-NOSVE-NEXT: umov w13, v1.h[7] +; NONEON-NOSVE-NEXT: mov v5.h[3], w15 +; NONEON-NOSVE-NEXT: umov w15, v0.h[7] +; NONEON-NOSVE-NEXT: udiv w17, w9, w14 +; NONEON-NOSVE-NEXT: msub w11, w16, w12, w11 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #16] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w12, w12, w5, w2 +; NONEON-NOSVE-NEXT: mov v4.h[4], w11 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v5.h[4], w12 +; NONEON-NOSVE-NEXT: msub w11, w11, w6, w3 +; NONEON-NOSVE-NEXT: udiv w24, w8, w10 +; NONEON-NOSVE-NEXT: msub w9, w17, w14, w9 +; NONEON-NOSVE-NEXT: mov v5.h[5], w11 +; NONEON-NOSVE-NEXT: mov v4.h[5], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: msub w9, w9, w4, w7 +; NONEON-NOSVE-NEXT: udiv w18, w23, w20 +; NONEON-NOSVE-NEXT: msub w8, w24, w10, w8 +; NONEON-NOSVE-NEXT: mov v5.h[6], w9 +; NONEON-NOSVE-NEXT: mov v4.h[6], w8 +; NONEON-NOSVE-NEXT: udiv w12, w15, w13 +; NONEON-NOSVE-NEXT: msub w8, w18, w20, w23 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v4.h[7], w8 +; NONEON-NOSVE-NEXT: msub w9, w12, w13, w15 +; NONEON-NOSVE-NEXT: mov v5.h[7], w9 +; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #144 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = urem <16 x i16> %op1, %op2 @@ -689,6 +2168,23 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 
killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: mov w11, v1.s[1] +; NONEON-NOSVE-NEXT: mov w12, v0.s[1] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 +; NONEON-NOSVE-NEXT: mov v0.s[1], w9 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = urem <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -704,6 +2200,30 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov w11, s1 +; NONEON-NOSVE-NEXT: fmov w12, s0 +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: mov w14, v1.s[2] +; NONEON-NOSVE-NEXT: mov w15, v0.s[2] +; NONEON-NOSVE-NEXT: mov w17, v1.s[3] +; NONEON-NOSVE-NEXT: mov w18, v0.s[3] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s0, w11 +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v0.s[1], w8 +; NONEON-NOSVE-NEXT: udiv w9, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v0.s[2], w8 +; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ret %res = urem <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -723,6 +2243,65 @@ define void @urem_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s ; CHECK-NEXT: stp q0, q1, 
[x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str x23, [sp, #-48]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -48 +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov w12, s0 +; NONEON-NOSVE-NEXT: fmov w3, s2 +; NONEON-NOSVE-NEXT: mov w9, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w11, s1 +; NONEON-NOSVE-NEXT: fmov w2, s3 +; NONEON-NOSVE-NEXT: mov w8, v1.s[1] +; NONEON-NOSVE-NEXT: mov w17, v3.s[1] +; NONEON-NOSVE-NEXT: mov w18, v2.s[1] +; NONEON-NOSVE-NEXT: mov w14, v1.s[2] +; NONEON-NOSVE-NEXT: mov w15, v0.s[2] +; NONEON-NOSVE-NEXT: mov w5, v3.s[2] +; NONEON-NOSVE-NEXT: mov w6, v2.s[2] +; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: mov w19, v3.s[3] +; NONEON-NOSVE-NEXT: mov w20, v2.s[3] +; NONEON-NOSVE-NEXT: mov w22, v1.s[3] +; NONEON-NOSVE-NEXT: mov w23, v0.s[3] +; NONEON-NOSVE-NEXT: udiv w4, w3, w2 +; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 +; NONEON-NOSVE-NEXT: fmov s1, w11 +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w12, w4, w2, w3 +; NONEON-NOSVE-NEXT: fmov s0, w12 +; NONEON-NOSVE-NEXT: udiv w1, w18, w17 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: mov v1.s[1], w8 +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w13, w1, w17, w18 +; NONEON-NOSVE-NEXT: mov v0.s[1], w13 +; NONEON-NOSVE-NEXT: udiv w7, w6, w5 +; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 +; NONEON-NOSVE-NEXT: mov v1.s[2], w8 +; NONEON-NOSVE-NEXT: udiv w21, w20, w19 +; NONEON-NOSVE-NEXT: msub w10, w7, w5, w6 +; NONEON-NOSVE-NEXT: mov v0.s[2], w10 
+; NONEON-NOSVE-NEXT: udiv w9, w23, w22 +; NONEON-NOSVE-NEXT: msub w10, w21, w19, w20 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v0.s[3], w10 +; NONEON-NOSVE-NEXT: msub w8, w9, w22, w23 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = urem <8 x i32> %op1, %op2 @@ -741,6 +2320,17 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ret %res = urem <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -756,6 +2346,20 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: mov x11, v1.d[1] +; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: udiv x13, x12, x11 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: ret %res = urem <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -775,6 +2379,33 @@ define void @urem_v4i64(ptr %a, ptr 
%b) { ; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: urem_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: fmov x15, d2 +; NONEON-NOSVE-NEXT: mov x12, v2.d[1] +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x14, d3 +; NONEON-NOSVE-NEXT: mov x11, v3.d[1] +; NONEON-NOSVE-NEXT: mov x17, v1.d[1] +; NONEON-NOSVE-NEXT: mov x18, v0.d[1] +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: udiv x16, x15, x14 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: udiv x13, x12, x11 +; NONEON-NOSVE-NEXT: msub x10, x16, x14, x15 +; NONEON-NOSVE-NEXT: fmov d0, x10 +; NONEON-NOSVE-NEXT: udiv x1, x18, x17 +; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 +; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: msub x11, x1, x17, x18 +; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = urem <4 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll index bfffe4b6315d74..0108fb580b947b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -16,6 +17,14 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { ; 
CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2 ret <4 x i8> %sel } @@ -31,6 +40,14 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.8b, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2 ret <8 x i8> %sel } @@ -46,6 +63,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.16b, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2 ret <16 x i8> %sel } @@ -64,6 +89,20 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.b, p0, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.16b, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, 
v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a %op2 = load volatile <32 x i8>, ptr %b %sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2 @@ -83,6 +122,14 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2 ret <2 x i16> %sel } @@ -99,6 +146,14 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4h, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2 ret <4 x i16> %sel } @@ -115,6 +170,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.8h, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2 ret <8 x i16> %sel } @@ -134,6 +197,20 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret 
+; +; NONEON-NOSVE-LABEL: select_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a %op2 = load volatile <16 x i16>, ptr %b %sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2 @@ -153,6 +230,14 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2 ret <2 x i32> %sel } @@ -169,6 +254,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: dup v2.4s, w8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2 ret <4 x i32> %sel } @@ -188,6 +281,20 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; 
NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a %op2 = load volatile <8 x i32>, ptr %b %sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2 @@ -208,6 +315,14 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: fmov d2, x8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel } @@ -225,6 +340,14 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: dup v2.2d, x8 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2 ret <2 x i64> %sel } @@ -245,6 +368,20 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) { ; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w2, #0x1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: ldr q3, [x1] +; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] +; 
NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b +; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a %op2 = load volatile <4 x i64>, ptr %b %sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll index 9319bd69c25fb6..f7198e3042ad53 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,16 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h +; NONEON-NOSVE-NEXT: sshl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -32,6 +43,12 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.8b, v1.8b +; NONEON-NOSVE-NEXT: sshl v0.8b, v0.8b, v1.8b +; 
NONEON-NOSVE-NEXT: ret %res = ashr <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -45,6 +62,12 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b +; NONEON-NOSVE-NEXT: sshl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = ashr <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -60,6 +83,17 @@ define void @ashr_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: asr z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.16b, v0.16b +; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b +; NONEON-NOSVE-NEXT: sshl v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: sshl v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = ashr <32 x i8> %op1, %op2 @@ -78,6 +112,16 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s +; NONEON-NOSVE-NEXT: sshl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -91,6 +135,12 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i16: +; NONEON-NOSVE: 
// %bb.0: +; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h +; NONEON-NOSVE-NEXT: sshl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -104,6 +154,12 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h +; NONEON-NOSVE-NEXT: sshl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = ashr <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -119,6 +175,17 @@ define void @ashr_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: asr z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.8h, v0.8h +; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h +; NONEON-NOSVE-NEXT: sshl v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: sshl v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = ashr <16 x i16> %op1, %op2 @@ -135,6 +202,12 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s +; NONEON-NOSVE-NEXT: sshl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -148,6 +221,12 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: sshl v0.4s, 
v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -163,6 +242,17 @@ define void @ashr_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: asr z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: sshl v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: sshl v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = ashr <8 x i32> %op1, %op2 @@ -179,6 +269,12 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg d1, d1 +; NONEON-NOSVE-NEXT: sshl d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = ashr <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -192,6 +288,12 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: sshl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -207,6 +309,17 @@ define void @ashr_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: asr z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ashr_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: sshl v0.2d, v2.2d, v0.2d +; 
NONEON-NOSVE-NEXT: sshl v1.2d, v3.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = ashr <4 x i64> %op1, %op2 @@ -229,6 +342,15 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h +; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -242,6 +364,12 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.8b, v1.8b +; NONEON-NOSVE-NEXT: ushl v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = lshr <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -255,6 +383,12 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ushl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = lshr <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -270,6 +404,17 @@ define void @lshr_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: lsr z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.16b, v0.16b +; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b +; 
NONEON-NOSVE-NEXT: ushl v0.16b, v2.16b, v0.16b +; NONEON-NOSVE-NEXT: ushl v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = lshr <32 x i8> %op1, %op2 @@ -288,6 +433,15 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b +; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s +; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i16> %op1, %op2 ret <2 x i16> %res } @@ -301,6 +455,12 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h +; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -314,6 +474,12 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ushl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = lshr <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -329,6 +495,17 @@ define void @lshr_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: lsr z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: 
neg v0.8h, v0.8h +; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ushl v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: ushl v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = lshr <16 x i16> %op1, %op2 @@ -345,6 +522,12 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s +; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -358,6 +541,12 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ushl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -373,6 +562,17 @@ define void @lshr_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ushl v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: ushl v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = lshr <8 x i32> %op1, %op2 @@ -389,6 +589,12 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: lshr_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg d1, d1 +; NONEON-NOSVE-NEXT: ushl d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = lshr <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -402,6 +608,12 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ushl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -417,6 +629,17 @@ define void @lshr_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: lshr_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: neg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ushl v0.2d, v2.2d, v0.2d +; NONEON-NOSVE-NEXT: ushl v1.2d, v3.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = lshr <4 x i64> %op1, %op2 @@ -438,6 +661,13 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v2i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d2, #0x0000ff000000ff +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shl <2 x i8> %op1, %op2 ret <2 x i8> %res } @@ -452,6 +682,13 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = shl <4 x i8> %op1, %op2 ret <4 x i8> %res } @@ -465,6 +702,11 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ret %res = shl <8 x i8> %op1, %op2 ret <8 x i8> %res } @@ -478,6 +720,11 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %res = shl <16 x i8> %op1, %op2 ret <16 x i8> %res } @@ -493,6 +740,15 @@ define void @shl_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: lsl z1.b, p0/m, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ushl v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: ushl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = shl <32 x i8> %op1, %op2 @@ -509,6 +765,11 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %res = shl <4 x i16> %op1, %op2 ret <4 x i16> %res } @@ -522,6 +783,11 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) 
{ ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ret %res = shl <8 x i16> %op1, %op2 ret <8 x i16> %res } @@ -537,6 +803,15 @@ define void @shl_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ushl v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ushl v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = shl <16 x i16> %op1, %op2 @@ -553,6 +828,11 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: ret %res = shl <2 x i32> %op1, %op2 ret <2 x i32> %res } @@ -566,6 +846,11 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ret %res = shl <4 x i32> %op1, %op2 ret <4 x i32> %res } @@ -581,6 +866,15 @@ define void @shl_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ushl v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ushl v1.4s, 
v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %res = shl <8 x i32> %op1, %op2 @@ -597,6 +891,11 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl d0, d0, d1 +; NONEON-NOSVE-NEXT: ret %res = shl <1 x i64> %op1, %op2 ret <1 x i64> %res } @@ -610,6 +909,11 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ret %res = shl <2 x i64> %op1, %op2 ret <2 x i64> %res } @@ -625,6 +929,15 @@ define void @shl_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shl_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ushl v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: ushl v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %res = shl <4 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 27dbfc9a23a8de..42d3b9d8f71f86 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; 
RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -15,6 +16,13 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) { ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res } @@ -27,6 +35,22 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ushll v1.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x half> store <8 x half> %res, ptr %b @@ -42,6 +66,29 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ushll v2.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v3.4s +; NONEON-NOSVE-NEXT: stp q2, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x half> store <16 x half> %res, ptr %b @@ -61,6 +108,13 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ucvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res } @@ -74,6 +128,12 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) { ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res } @@ -90,6 +150,20 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -114,6 +188,26 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x float> store <16 x float> %res, ptr %b @@ -132,6 +226,13 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) { ; CHECK-NEXT: and w8, w8, #0xffff ; CHECK-NEXT: ucvtf d0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: umov w8, v0.h[0] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ret %res = uitofp <1 x i16> %op1 to <1 
x double> ret <1 x double> %res } @@ -146,6 +247,14 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res } @@ -163,6 +272,21 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %res = uitofp <4 x i16> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -191,6 +315,30 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q3, [x1] ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q2, [x1] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -239,6 +387,46 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] +; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ushll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: ushll v7.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v5.2d, v5.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: ucvtf v4.2d, v4.2d +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v7.2d +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v6.2d +; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x double> store <16 x double> %res, ptr %b @@ -258,6 +446,13 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; 
NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res } @@ -271,6 +466,12 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res } @@ -288,6 +489,15 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x half> ret <8 x half> %res @@ -312,6 +522,21 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v16i32_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v3.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i32>, ptr %a %res = uitofp <16 x i32> %op1 to <16 x half> store <16 x half> %res, ptr %b @@ -330,6 +555,11 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) { ; 
CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res } @@ -342,6 +572,11 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) { ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res } @@ -355,6 +590,14 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -374,6 +617,12 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res } @@ -390,6 +639,20 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = uitofp <4 x i32> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -414,6 +677,26 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -440,6 +723,18 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h2, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; 
NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x half> ret <2 x half> %res } @@ -460,6 +755,16 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x half> ret <4 x half> %res @@ -493,6 +798,22 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn v2.2s, v2.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: fcvtn2 v2.4s, v3.2d +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v2.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i64>, ptr %a %res = uitofp <8 x i64> %op1 to <8 x half> ret <8 x half> %res @@ -511,6 +832,12 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = 
uitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res } @@ -528,6 +855,15 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x float> ret <4 x float> %res @@ -552,6 +888,21 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v2.2d +; NONEON-NOSVE-NEXT: fcvtn2 v1.4s, v3.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i64>, ptr %a %res = uitofp <8 x i64> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -570,6 +921,11 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) { ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res } @@ -583,6 +939,14 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; 
CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -601,6 +965,13 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) { ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res } @@ -613,6 +984,22 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: sshll v1.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x half> store <8 x half> %res, ptr %b @@ -628,6 +1015,29 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: sshll v2.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: scvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v3.4s +; NONEON-NOSVE-NEXT: stp q2, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x half> store <16 x half> %res, ptr %b @@ -646,6 +1056,13 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, 
v0.2s, #16 +; NONEON-NOSVE-NEXT: scvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res } @@ -659,6 +1076,12 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res } @@ -675,6 +1098,20 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -699,6 +1136,26 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: scvtf v2.4s, v2.4s +; NONEON-NOSVE-NEXT: scvtf v3.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x float> store <16 x float> %res, ptr %b @@ -720,6 +1177,14 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res } @@ -737,6 +1202,21 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %res = sitofp <4 x i16> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -765,6 +1245,30 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q3, [x1] ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q2, [x1] +; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -813,6 +1317,46 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 +; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] +; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: sshll v7.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: scvtf v5.2d, v5.2d +; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: scvtf v4.2d, v4.2d +; NONEON-NOSVE-NEXT: stp q0, q5, [x1] +; NONEON-NOSVE-NEXT: scvtf v0.2d, v7.2d +; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] +; NONEON-NOSVE-NEXT: scvtf v1.2d, v6.2d +; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x double> store <16 x double> %res, ptr %b @@ -832,6 +1376,13 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; 
NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res } @@ -845,6 +1396,12 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res } @@ -862,6 +1419,15 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x half> ret <8 x half> %res @@ -879,6 +1445,11 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res } @@ -891,6 +1462,11 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) { ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res } @@ -904,6 +1480,14 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z1.s, p0/m, z1.s ; CHECK-NEXT: 
stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x float> store <8 x float> %res, ptr %b @@ -923,6 +1507,12 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res } @@ -939,6 +1529,20 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = sitofp <4 x i32> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -963,6 +1567,26 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q0, [x1, #32] ; CHECK-NEXT: stp q3, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x double> store <8 x double> %res, ptr %b @@ -1007,6 +1631,40 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q2, q1, [x1] ; CHECK-NEXT: stp q4, q0, [x1, #32] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #-64]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] +; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] +; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 +; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] +; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 +; NONEON-NOSVE-NEXT: ldr d7, [sp, #8] +; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 +; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d +; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d +; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: sshll v7.2d, v7.2s, #0 +; NONEON-NOSVE-NEXT: scvtf v4.2d, v4.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: scvtf v5.2d, v5.2d +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q2, q4, [x1, #96] +; NONEON-NOSVE-NEXT: scvtf v2.2d, v6.2d +; NONEON-NOSVE-NEXT: stp q3, q5, [x1, #64] +; NONEON-NOSVE-NEXT: scvtf v3.2d, v7.2d +; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q0, q3, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i32>, ptr %a %res = sitofp <16 x i32> %op1 to <16 x double> store <16 x double> %res, ptr %b @@ -1033,6 +1691,18 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov x8, v0.d[1] +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h2, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s1 +; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x half> ret <2 x half> %res } @@ -1053,6 +1723,16 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) { ; CHECK-NEXT: 
uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x half> ret <4 x half> %res @@ -1071,6 +1751,12 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res } @@ -1088,6 +1774,15 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x float> ret <4 x float> %res @@ -1105,6 +1800,11 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) { ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res } @@ -1118,6 +1818,14 @@ define void @scvtf_v4i64_v4f64(ptr 
%a, ptr %b) { ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x double> store <4 x double> %res, ptr %b @@ -1130,6 +1838,13 @@ define half @scvtf_i16_f16(ptr %0) { ; CHECK-NEXT: ldrsh w8, [x0] ; CHECK-NEXT: scvtf h0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i16_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrsh w8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = sitofp i16 %2 to half ret half %3 @@ -1141,6 +1856,12 @@ define float @scvtf_i16_f32(ptr %0) { ; CHECK-NEXT: ldrsh w8, [x0] ; CHECK-NEXT: scvtf s0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i16_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrsh w8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = sitofp i16 %2 to float ret float %3 @@ -1152,6 +1873,12 @@ define double @scvtf_i16_f64(ptr %0) { ; CHECK-NEXT: ldrsh w8, [x0] ; CHECK-NEXT: scvtf d0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i16_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrsh w8, [x0] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = sitofp i16 %2 to double ret double %3 @@ -1163,6 +1890,13 @@ define half @scvtf_i32_f16(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: scvtf h0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i32_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = sitofp i32 
%2 to half ret half %3 @@ -1174,6 +1908,12 @@ define float @scvtf_i32_f32(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: scvtf s0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i32_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = sitofp i32 %2 to float ret float %3 @@ -1185,6 +1925,12 @@ define double @scvtf_i32_f64(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: scvtf d0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i32_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = sitofp i32 %2 to double ret double %3 @@ -1196,6 +1942,13 @@ define half @scvtf_i64_f16(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: scvtf h0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i64_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = sitofp i64 %2 to half ret half %3 @@ -1207,6 +1960,12 @@ define float @scvtf_i64_f32(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: scvtf s0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i64_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = sitofp i64 %2 to float ret float %3 @@ -1218,6 +1977,12 @@ define double @scvtf_i64_f64(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: scvtf d0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: scvtf_i64_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = sitofp i64 %2 to double ret double %3 @@ -1229,6 +1994,13 @@ define half @ucvtf_i16_f16(ptr %0) { ; CHECK-NEXT: ldrh w8, [x0] ; 
CHECK-NEXT: ucvtf h0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i16_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to half ret half %3 @@ -1240,6 +2012,12 @@ define float @ucvtf_i16_f32(ptr %0) { ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ucvtf s0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i16_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to float ret float %3 @@ -1251,6 +2029,12 @@ define double @ucvtf_i16_f64(ptr %0) { ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ucvtf d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i16_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to double ret double %3 @@ -1262,6 +2046,13 @@ define half @ucvtf_i32_f16(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: ucvtf h0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i32_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = uitofp i32 %2 to half ret half %3 @@ -1273,6 +2064,12 @@ define float @ucvtf_i32_f32(ptr %0) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: ucvtf s0, w8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i32_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = uitofp i32 %2 to float ret float %3 @@ -1284,6 +2081,12 @@ define double @ucvtf_i32_f64(ptr %0) { ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ucvtf d0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
ucvtf_i32_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = uitofp i32 %2 to double ret double %3 @@ -1295,6 +2098,13 @@ define half @ucvtf_i64_f16(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: ucvtf h0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i64_f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = uitofp i64 %2 to half ret half %3 @@ -1306,6 +2116,12 @@ define float @ucvtf_i64_f32(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: ucvtf s0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i64_f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = uitofp i64 %2 to float ret float %3 @@ -1317,6 +2133,12 @@ define double @ucvtf_i64_f64(ptr %0) { ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: ucvtf d0, x8 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ucvtf_i64_f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr x8, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, x8 +; NONEON-NOSVE-NEXT: ret %2 = load i64, ptr %0, align 64 %3 = uitofp i64 %2 to double ret double %3 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 3775a64a89a0cb..250929df6b3c35 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < 
%s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,13 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2 ret <4 x i8> %sel } @@ -36,6 +44,13 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) { ; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.8b, v2.8b, #7 +; NONEON-NOSVE-NEXT: cmlt v2.8b, v2.8b, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2 ret <8 x i8> %sel } @@ -54,6 +69,13 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) ; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.16b, v2.16b, #7 +; NONEON-NOSVE-NEXT: cmlt v2.16b, v2.16b, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2 ret <16 x i8> %sel } @@ -70,6 +92,18 @@ define void @select_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.b, p0, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: cmeq v4.16b, v0.16b, v1.16b +; 
NONEON-NOSVE-NEXT: cmeq v5.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %mask = icmp eq <32 x i8> %op1, %op2 @@ -92,6 +126,13 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2 ret <2 x i16> %sel } @@ -110,6 +151,13 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2 ret <4 x i16> %sel } @@ -129,6 +177,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 +; NONEON-NOSVE-NEXT: shl v2.8h, v2.8h, #15 +; NONEON-NOSVE-NEXT: cmlt v2.8h, v2.8h, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2 ret <8 x i16> %sel } @@ -145,6 +201,18 @@ define void 
@select_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: cmeq v4.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: cmeq v5.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %mask = icmp eq <16 x i16> %op1, %op2 @@ -167,6 +235,13 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2 ret <2 x i32> %sel } @@ -186,6 +261,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) { ; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 +; NONEON-NOSVE-NEXT: shl v2.4s, v2.4s, #31 +; NONEON-NOSVE-NEXT: cmlt v2.4s, v2.4s, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2 ret <4 x i32> %sel } @@ -202,6 +285,18 @@ define void @select_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: cmeq v4.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: cmeq v5.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b +; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %mask = icmp eq <8 x i32> %op1, %op2 @@ -223,6 +318,14 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) { ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: tst w0, #0x1 +; NONEON-NOSVE-NEXT: csetm x8, ne +; NONEON-NOSVE-NEXT: fmov d2, x8 +; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ret %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel } @@ -242,6 +345,14 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) { ; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 +; NONEON-NOSVE-NEXT: shl v2.2d, v2.2d, #63 +; NONEON-NOSVE-NEXT: cmlt v2.2d, v2.2d, #0 +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2 ret <2 x i64> %sel } @@ -258,6 +369,18 @@ define void @select_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: select_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: cmeq v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: cmeq v5.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b 
+; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b +; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %mask = icmp eq <4 x i64> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll index 918f0ccc0cf6a0..42c439ca4b38d4 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,19 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) { ; CHECK-NEXT: stp q2, q5, [x0, #32] ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q3, q4, [x0] +; NONEON-NOSVE-NEXT: add v2.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v5.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: dup v0.4s, v1.s[2] +; NONEON-NOSVE-NEXT: add v1.4s, v3.4s, v3.4s +; NONEON-NOSVE-NEXT: add v3.4s, v4.4s, v4.4s +; NONEON-NOSVE-NEXT: stp q2, q5, [x0, #32] +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 %1 = load <16 x i32>, ptr %arg2, align 256 @@ -42,6 +56,19 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) { ; CHECK-NEXT: stp q3, q4, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test2: +; NONEON-NOSVE: // %bb.0: // %entry +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, 
#32] +; NONEON-NOSVE-NEXT: ldp q3, q4, [x0] +; NONEON-NOSVE-NEXT: add v2.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: dup v0.2s, v1.s[2] +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: add v3.4s, v3.4s, v3.4s +; NONEON-NOSVE-NEXT: add v4.4s, v4.4s, v4.4s +; NONEON-NOSVE-NEXT: stp q2, q1, [x0, #32] +; NONEON-NOSVE-NEXT: stp q3, q4, [x0] +; NONEON-NOSVE-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 %1 = load <16 x i32>, ptr %arg2, align 256 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll index 8c69d5b0bb375d..992b667a2eafe1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -11,6 +12,13 @@ define <4 x i8> @load_v4i8(ptr %a) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = load <4 x i8>, ptr %a ret <4 x i8> %load } @@ -20,6 +28,11 @@ define <8 x i8> @load_v8i8(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x i8>, ptr %a ret <8 x i8> %load } @@ -29,6 +42,11 @@ define <16 x i8> @load_v16i8(ptr %a) { ; CHECK: // %bb.0: 
; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <16 x i8>, ptr %a ret <16 x i8> %load } @@ -38,6 +56,11 @@ define <32 x i8> @load_v32i8(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <32 x i8>, ptr %a ret <32 x i8> %load } @@ -49,6 +72,15 @@ define <2 x i16> @load_v2i16(ptr %a) { ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: add x8, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = load <2 x i16>, ptr %a ret <2 x i16> %load } @@ -58,6 +90,11 @@ define <2 x half> @load_v2f16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x half>, ptr %a ret <2 x half> %load } @@ -67,6 +104,11 @@ define <4 x i16> @load_v4i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x i16>, ptr %a ret <4 x i16> %load } @@ -76,6 +118,11 @@ define <4 x half> @load_v4f16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x half>, ptr %a ret <4 x half> %load } @@ -85,6 +132,11 @@ define <8 
x i16> @load_v8i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x i16>, ptr %a ret <8 x i16> %load } @@ -94,6 +146,11 @@ define <8 x half> @load_v8f16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x half>, ptr %a ret <8 x half> %load } @@ -103,6 +160,11 @@ define <16 x i16> @load_v16i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <16 x i16>, ptr %a ret <16 x i16> %load } @@ -112,6 +174,11 @@ define <16 x half> @load_v16f16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <16 x half>, ptr %a ret <16 x half> %load } @@ -121,6 +188,11 @@ define <2 x i32> @load_v2i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x i32>, ptr %a ret <2 x i32> %load } @@ -130,6 +202,11 @@ define <2 x float> @load_v2f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x float>, ptr %a ret <2 x float> %load } @@ -139,6 +216,11 @@ define <4 x i32> @load_v4i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4i32: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x i32>, ptr %a ret <4 x i32> %load } @@ -148,6 +230,11 @@ define <4 x float> @load_v4f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x float>, ptr %a ret <4 x float> %load } @@ -157,6 +244,11 @@ define <8 x i32> @load_v8i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x i32>, ptr %a ret <8 x i32> %load } @@ -166,6 +258,11 @@ define <8 x float> @load_v8f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <8 x float>, ptr %a ret <8 x float> %load } @@ -175,6 +272,11 @@ define <1 x i64> @load_v1i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <1 x i64>, ptr %a ret <1 x i64> %load } @@ -184,6 +286,11 @@ define <1 x double> @load_v1f64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <1 x double>, ptr %a ret <1 x double> %load } @@ -193,6 +300,11 @@ define <2 x i64> @load_v2i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x i64>, ptr %a ret <2 x i64> %load } @@ 
-202,6 +314,11 @@ define <2 x double> @load_v2f64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <2 x double>, ptr %a ret <2 x double> %load } @@ -211,6 +328,11 @@ define <4 x i64> @load_v4i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x i64>, ptr %a ret <4 x i64> %load } @@ -220,6 +342,11 @@ define <4 x double> @load_v4f64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: load_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %load = load <4 x double>, ptr %a ret <4 x double> %load } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll index ef52eadc5d3b09..7abe73f08dfd65 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -17,6 +18,14 @@ define i8 @andv_v4i8(<4 x i8> %a) { ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; 
NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a) ret i8 %res } @@ -29,6 +38,15 @@ define i8 @andv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: andv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a) ret i8 %res } @@ -41,6 +59,20 @@ define i8 @andv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: andv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a) ret i8 %res } @@ -54,6 +86,22 @@ define i8 @andv_v32i8(ptr %a) { ; CHECK-NEXT: andv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op) ret i8 %res @@ -67,6 +115,13 @@ define i16 @andv_v2i16(<2 x i16> %a) { ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a) ret i16 %res } @@ -79,6 +134,14 @@ define i16 @andv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a) ret i16 %res } @@ -91,6 +154,19 @@ define i16 @andv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a) ret i16 %res } @@ -104,6 +180,21 @@ define i16 @andv_v16i16(ptr %a) { ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op) ret i16 %res @@ -117,6 +208,13 @@ define i32 @andv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a) ret i32 %res } @@ -129,6 +227,18 @@ define i32 @andv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) ret i32 %res } @@ -142,6 +252,20 @@ define i32 @andv_v8i32(ptr %a) { ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op) ret i32 %res @@ -155,6 +279,16 @@ define i64 @andv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: andv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a) ret i64 %res } @@ -168,6 +302,18 @@ define i64 @andv_v4i64(ptr %a) { ; CHECK-NEXT: andv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: andv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op) ret i64 %res @@ -185,6 +331,14 @@ define i8 @eorv_v4i8(<4 x i8> %a) { ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a) ret i8 %res } @@ -197,6 +351,15 @@ define i8 @eorv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: eorv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a) ret i8 %res } @@ -209,6 +372,20 @@ define i8 @eorv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: eorv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a) ret i8 %res } @@ -222,6 +399,22 @@ define i8 @eorv_v32i8(ptr %a) { ; CHECK-NEXT: eorv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op) ret i8 %res @@ -235,6 +428,13 @@ define i16 @eorv_v2i16(<2 x i16> %a) { ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a) ret i16 %res } @@ -247,6 +447,14 @@ define i16 @eorv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: 
lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a) ret i16 %res } @@ -259,6 +467,19 @@ define i16 @eorv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a) ret i16 %res } @@ -272,6 +493,21 @@ define i16 @eorv_v16i16(ptr %a) { ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op) ret i16 %res @@ -285,6 +521,13 @@ define i32 @eorv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a) ret i32 %res } @@ -297,6 +540,18 @@ define i32 @eorv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) ret i32 %res } @@ -310,6 +565,20 @@ define i32 @eorv_v8i32(ptr %a) { ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op) ret i32 %res @@ -323,6 +592,16 @@ define i64 @eorv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: eorv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a) ret i64 %res } @@ -336,6 +615,18 @@ define i64 @eorv_v4i64(ptr %a) { ; CHECK-NEXT: eorv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: eorv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op) ret i64 %res @@ -353,6 +644,14 @@ define i8 @orv_v4i8(<4 x i8> %a) { ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a) ret i8 %res } @@ -365,6 +664,15 @@ define i8 @orv_v8i8(<8 x i8> %a) { ; CHECK-NEXT: orv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a) ret i8 %res } @@ -377,6 +685,20 @@ define i8 @orv_v16i8(<16 x i8> %a) { ; CHECK-NEXT: orv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a) ret i8 %res } @@ -390,6 +712,22 @@ define i8 @orv_v32i8(ptr %a) { ; CHECK-NEXT: orv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 +; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op) ret i8 %res @@ -403,6 +741,13 @@ define i16 @orv_v2i16(<2 x i16> %a) { ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a) ret i16 %res } @@ -415,6 +760,14 @@ define i16 @orv_v4i16(<4 x i16> %a) { ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, 
#16 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a) ret i16 %res } @@ -427,6 +780,19 @@ define i16 @orv_v8i16(<8 x i16> %a) { ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a) ret i16 %res } @@ -440,6 +806,21 @@ define i16 @orv_v16i16(ptr %a) { ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 +; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op) ret i16 %res @@ -453,6 +834,13 @@ define i32 @orv_v2i32(<2 x i32> %a) { ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a) ret i32 %res } @@ -465,6 +853,18 @@ define i32 @orv_v4i32(<4 x i32> %a) { ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) ret i32 %res } @@ -478,6 +878,20 @@ define i32 @orv_v8i32(ptr %a) { ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op) ret i32 %res @@ -491,6 +905,16 @@ define i64 @orv_v2i64(<2 x i64> %a) { ; CHECK-NEXT: orv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a) ret i64 %res } @@ -504,6 +928,18 @@ define i64 @orv_v4i64(ptr %a) { ; CHECK-NEXT: orv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: orv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] +; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op) ret i64 %res diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 4f8f8c2e4b244a..6c33613f8e757d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,44 @@ define <4 x i8> @masked_load_v4i8(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI0_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB0_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[0], [x0] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB0_3 +; NONEON-NOSVE-NEXT: b .LBB0_4 +; NONEON-NOSVE-NEXT: .LBB0_2: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB0_4 +; NONEON-NOSVE-NEXT: .LBB0_3: // %cond.load1 +; 
NONEON-NOSVE-NEXT: add x9, x0, #1 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: .LBB0_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB0_7 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB0_8 +; NONEON-NOSVE-NEXT: .LBB0_6: // %else8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB0_7: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB0_6 +; NONEON-NOSVE-NEXT: .LBB0_8: // %cond.load7 +; NONEON-NOSVE-NEXT: add x8, x0, #3 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <4 x i8> @llvm.masked.load.v4i8(ptr %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer) ret <4 x i8> %load } @@ -34,6 +73,67 @@ define <8 x i8> @masked_load_v8i8(ptr %src, <8 x i1> %mask) { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB1_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr b0, [x0] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB1_3 +; NONEON-NOSVE-NEXT: b .LBB1_4 +; NONEON-NOSVE-NEXT: .LBB1_2: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB1_4 +; NONEON-NOSVE-NEXT: .LBB1_3: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #1 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] +; NONEON-NOSVE-NEXT: .LBB1_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB1_11 +; NONEON-NOSVE-NEXT: 
// %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB1_12 +; NONEON-NOSVE-NEXT: .LBB1_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB1_13 +; NONEON-NOSVE-NEXT: .LBB1_7: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB1_14 +; NONEON-NOSVE-NEXT: .LBB1_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB1_15 +; NONEON-NOSVE-NEXT: .LBB1_9: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_16 +; NONEON-NOSVE-NEXT: .LBB1_10: // %else20 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB1_11: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB1_6 +; NONEON-NOSVE-NEXT: .LBB1_12: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #3 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB1_7 +; NONEON-NOSVE-NEXT: .LBB1_13: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB1_8 +; NONEON-NOSVE-NEXT: .LBB1_14: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #5 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB1_9 +; NONEON-NOSVE-NEXT: .LBB1_15: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_10 +; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.load19 +; NONEON-NOSVE-NEXT: add x8, x0, #7 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer) ret <8 x i8> %load } @@ -49,6 +149,115 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v16i8: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: addv h1, v0.8h +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18 +; NONEON-NOSVE-NEXT: .LBB2_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB2_19 +; NONEON-NOSVE-NEXT: .LBB2_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB2_20 +; NONEON-NOSVE-NEXT: .LBB2_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB2_21 +; NONEON-NOSVE-NEXT: .LBB2_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB2_22 +; NONEON-NOSVE-NEXT: .LBB2_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB2_23 +; NONEON-NOSVE-NEXT: .LBB2_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB2_24 +; NONEON-NOSVE-NEXT: .LBB2_8: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB2_25 +; NONEON-NOSVE-NEXT: .LBB2_9: // %else23 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB2_26 +; NONEON-NOSVE-NEXT: .LBB2_10: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB2_27 +; NONEON-NOSVE-NEXT: .LBB2_11: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB2_28 +; NONEON-NOSVE-NEXT: .LBB2_12: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB2_29 +; NONEON-NOSVE-NEXT: .LBB2_13: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB2_30 +; NONEON-NOSVE-NEXT: .LBB2_14: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB2_31 +; NONEON-NOSVE-NEXT: .LBB2_15: // %else41 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32 +; NONEON-NOSVE-NEXT: .LBB2_16: // %else44 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.load +; NONEON-NOSVE-NEXT: ldr b0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB2_2 
+; NONEON-NOSVE-NEXT: .LBB2_18: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #1 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB2_3 +; NONEON-NOSVE-NEXT: .LBB2_19: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB2_4 +; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #3 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB2_5 +; NONEON-NOSVE-NEXT: .LBB2_21: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB2_6 +; NONEON-NOSVE-NEXT: .LBB2_22: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #5 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB2_7 +; NONEON-NOSVE-NEXT: .LBB2_23: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB2_8 +; NONEON-NOSVE-NEXT: .LBB2_24: // %cond.load19 +; NONEON-NOSVE-NEXT: add x9, x0, #7 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB2_9 +; NONEON-NOSVE-NEXT: .LBB2_25: // %cond.load22 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[8], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB2_10 +; NONEON-NOSVE-NEXT: .LBB2_26: // %cond.load25 +; NONEON-NOSVE-NEXT: add x9, x0, #9 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[9], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB2_11 +; NONEON-NOSVE-NEXT: .LBB2_27: // %cond.load28 +; NONEON-NOSVE-NEXT: add x9, x0, #10 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[10], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB2_12 +; NONEON-NOSVE-NEXT: .LBB2_28: // %cond.load31 +; NONEON-NOSVE-NEXT: add x9, x0, #11 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[11], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB2_13 +; NONEON-NOSVE-NEXT: .LBB2_29: // %cond.load34 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: 
ld1 { v0.b }[12], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB2_14 +; NONEON-NOSVE-NEXT: .LBB2_30: // %cond.load37 +; NONEON-NOSVE-NEXT: add x9, x0, #13 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[13], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB2_15 +; NONEON-NOSVE-NEXT: .LBB2_31: // %cond.load40 +; NONEON-NOSVE-NEXT: add x9, x0, #14 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[14], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16 +; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.load43 +; NONEON-NOSVE-NEXT: add x8, x0, #15 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[15], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer) ret <16 x i8> %load } @@ -130,6 +339,277 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) { ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: fmov s1, w1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] +; NONEON-NOSVE-NEXT: mov v1.b[1], w2 +; NONEON-NOSVE-NEXT: mov v0.b[1], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: mov v1.b[2], w3 +; NONEON-NOSVE-NEXT: mov v0.b[2], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: mov v1.b[3], w4 +; NONEON-NOSVE-NEXT: mov v0.b[3], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] +; NONEON-NOSVE-NEXT: mov v1.b[4], w5 +; NONEON-NOSVE-NEXT: mov v0.b[4], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: mov v1.b[5], w6 +; NONEON-NOSVE-NEXT: mov v0.b[5], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] +; NONEON-NOSVE-NEXT: mov v1.b[6], w7 +; NONEON-NOSVE-NEXT: mov v0.b[6], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: mov v1.b[7], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: mov v0.b[7], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, 
#136] +; NONEON-NOSVE-NEXT: mov v1.b[8], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: mov v0.b[8], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: mov v1.b[9], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: mov v0.b[9], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] +; NONEON-NOSVE-NEXT: mov v1.b[10], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: mov v0.b[10], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: mov v1.b[11], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: mov v0.b[11], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] +; NONEON-NOSVE-NEXT: mov v1.b[12], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: mov v0.b[12], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: mov v1.b[13], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] +; NONEON-NOSVE-NEXT: mov v0.b[13], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] +; NONEON-NOSVE-NEXT: mov v1.b[14], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #64] +; NONEON-NOSVE-NEXT: mov v0.b[14], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: mov v1.b[15], w9 +; NONEON-NOSVE-NEXT: mov v0.b[15], w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] +; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: addv h1, v1.8h +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: movi v0.2d, 
#0000000000000000 +; NONEON-NOSVE-NEXT: bfi w8, w9, #16, #16 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34 +; NONEON-NOSVE-NEXT: .LBB3_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35 +; NONEON-NOSVE-NEXT: .LBB3_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36 +; NONEON-NOSVE-NEXT: .LBB3_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37 +; NONEON-NOSVE-NEXT: .LBB3_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38 +; NONEON-NOSVE-NEXT: .LBB3_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39 +; NONEON-NOSVE-NEXT: .LBB3_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40 +; NONEON-NOSVE-NEXT: .LBB3_8: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41 +; NONEON-NOSVE-NEXT: .LBB3_9: // %else23 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42 +; NONEON-NOSVE-NEXT: .LBB3_10: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43 +; NONEON-NOSVE-NEXT: .LBB3_11: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44 +; NONEON-NOSVE-NEXT: .LBB3_12: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45 +; NONEON-NOSVE-NEXT: .LBB3_13: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46 +; NONEON-NOSVE-NEXT: .LBB3_14: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47 +; NONEON-NOSVE-NEXT: .LBB3_15: // %else41 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48 +; NONEON-NOSVE-NEXT: .LBB3_16: // %else44 +; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49 +; NONEON-NOSVE-NEXT: .LBB3_17: // %else47 +; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50 +; NONEON-NOSVE-NEXT: .LBB3_18: // %else50 +; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51 +; NONEON-NOSVE-NEXT: .LBB3_19: // %else53 +; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52 +; NONEON-NOSVE-NEXT: .LBB3_20: // %else56 +; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53 +; NONEON-NOSVE-NEXT: .LBB3_21: // %else59 +; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54 +; NONEON-NOSVE-NEXT: .LBB3_22: // %else62 +; NONEON-NOSVE-NEXT: 
tbnz w8, #22, .LBB3_55 +; NONEON-NOSVE-NEXT: .LBB3_23: // %else65 +; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56 +; NONEON-NOSVE-NEXT: .LBB3_24: // %else68 +; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57 +; NONEON-NOSVE-NEXT: .LBB3_25: // %else71 +; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58 +; NONEON-NOSVE-NEXT: .LBB3_26: // %else74 +; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_59 +; NONEON-NOSVE-NEXT: .LBB3_27: // %else77 +; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60 +; NONEON-NOSVE-NEXT: .LBB3_28: // %else80 +; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61 +; NONEON-NOSVE-NEXT: .LBB3_29: // %else83 +; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62 +; NONEON-NOSVE-NEXT: .LBB3_30: // %else86 +; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63 +; NONEON-NOSVE-NEXT: .LBB3_31: // %else89 +; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64 +; NONEON-NOSVE-NEXT: .LBB3_32: // %else92 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.load +; NONEON-NOSVE-NEXT: ldr b0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2 +; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #1 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3 +; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4 +; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #3 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5 +; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6 +; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #5 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7 +; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] +; 
NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8 +; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.load19 +; NONEON-NOSVE-NEXT: add x9, x0, #7 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9 +; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.load22 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[8], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10 +; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.load25 +; NONEON-NOSVE-NEXT: add x9, x0, #9 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[9], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11 +; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.load28 +; NONEON-NOSVE-NEXT: add x9, x0, #10 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[10], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12 +; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.load31 +; NONEON-NOSVE-NEXT: add x9, x0, #11 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[11], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13 +; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.load34 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[12], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14 +; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.load37 +; NONEON-NOSVE-NEXT: add x9, x0, #13 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[13], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15 +; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.load40 +; NONEON-NOSVE-NEXT: add x9, x0, #14 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[14], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16 +; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.load43 +; NONEON-NOSVE-NEXT: add x9, x0, #15 +; NONEON-NOSVE-NEXT: ld1 { v0.b }[15], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17 +; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.load46 +; NONEON-NOSVE-NEXT: add x9, x0, #16 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[0], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18 +; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.load49 +; NONEON-NOSVE-NEXT: add x9, x0, #17 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19 +; NONEON-NOSVE-NEXT: .LBB3_51: // 
%cond.load52 +; NONEON-NOSVE-NEXT: add x9, x0, #18 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20 +; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.load55 +; NONEON-NOSVE-NEXT: add x9, x0, #19 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21 +; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.load58 +; NONEON-NOSVE-NEXT: add x9, x0, #20 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22 +; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.load61 +; NONEON-NOSVE-NEXT: add x9, x0, #21 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23 +; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.load64 +; NONEON-NOSVE-NEXT: add x9, x0, #22 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24 +; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.load67 +; NONEON-NOSVE-NEXT: add x9, x0, #23 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[7], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25 +; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.load70 +; NONEON-NOSVE-NEXT: add x9, x0, #24 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[8], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26 +; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.load73 +; NONEON-NOSVE-NEXT: add x9, x0, #25 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[9], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27 +; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.load76 +; NONEON-NOSVE-NEXT: add x9, x0, #26 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[10], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28 +; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.load79 +; NONEON-NOSVE-NEXT: add x9, x0, #27 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[11], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29 +; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.load82 +; NONEON-NOSVE-NEXT: add x9, x0, #28 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[12], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30 +; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.load85 +; NONEON-NOSVE-NEXT: add x9, x0, #29 +; NONEON-NOSVE-NEXT: ld1 { 
v1.b }[13], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31 +; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.load88 +; NONEON-NOSVE-NEXT: add x9, x0, #30 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[14], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32 +; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.load91 +; NONEON-NOSVE-NEXT: add x8, x0, #31 +; NONEON-NOSVE-NEXT: ld1 { v1.b }[15], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer) ret <32 x i8> %load } @@ -155,6 +635,31 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB4_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_4 +; NONEON-NOSVE-NEXT: .LBB4_2: // %else2 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB4_2 +; NONEON-NOSVE-NEXT: .LBB4_4: // %cond.load1 +; NONEON-NOSVE-NEXT: add x8, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <2 x half> @llvm.masked.load.v2f16(ptr %src, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer) ret <2 x half> %load } @@ -170,6 +675,43 @@ define <4 x half> @masked_load_v4f16(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: 
// kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h1, v0.4h +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB5_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_6 +; NONEON-NOSVE-NEXT: .LBB5_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB5_7 +; NONEON-NOSVE-NEXT: .LBB5_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB5_8 +; NONEON-NOSVE-NEXT: .LBB5_4: // %else8 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB5_5: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB5_2 +; NONEON-NOSVE-NEXT: .LBB5_6: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB5_3 +; NONEON-NOSVE-NEXT: .LBB5_7: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB5_4 +; NONEON-NOSVE-NEXT: .LBB5_8: // %cond.load7 +; NONEON-NOSVE-NEXT: add x8, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <4 x half> @llvm.masked.load.v4f16(ptr %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer) ret <4 x half> %load } @@ -186,6 +728,65 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl 
v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b1, v0.8b +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB6_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_10 +; NONEON-NOSVE-NEXT: .LBB6_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB6_11 +; NONEON-NOSVE-NEXT: .LBB6_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB6_12 +; NONEON-NOSVE-NEXT: .LBB6_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB6_13 +; NONEON-NOSVE-NEXT: .LBB6_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB6_14 +; NONEON-NOSVE-NEXT: .LBB6_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB6_15 +; NONEON-NOSVE-NEXT: .LBB6_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_16 +; NONEON-NOSVE-NEXT: .LBB6_8: // %else20 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB6_9: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB6_2 +; NONEON-NOSVE-NEXT: .LBB6_10: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB6_3 +; NONEON-NOSVE-NEXT: .LBB6_11: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB6_4 +; NONEON-NOSVE-NEXT: .LBB6_12: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB6_5 +; NONEON-NOSVE-NEXT: .LBB6_13: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB6_6 +; NONEON-NOSVE-NEXT: .LBB6_14: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #10 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, 
.LBB6_7 +; NONEON-NOSVE-NEXT: .LBB6_15: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB6_8 +; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.load19 +; NONEON-NOSVE-NEXT: add x8, x0, #14 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer) ret <8 x half> %load } @@ -210,6 +811,116 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) { ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: addv h2, v0.8h +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s2 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18 +; NONEON-NOSVE-NEXT: .LBB7_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB7_19 +; NONEON-NOSVE-NEXT: .LBB7_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB7_20 +; NONEON-NOSVE-NEXT: .LBB7_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB7_21 +; NONEON-NOSVE-NEXT: .LBB7_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB7_22 +; NONEON-NOSVE-NEXT: .LBB7_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB7_23 +; NONEON-NOSVE-NEXT: .LBB7_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB7_24 +; NONEON-NOSVE-NEXT: .LBB7_8: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB7_25 +; 
NONEON-NOSVE-NEXT: .LBB7_9: // %else23 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB7_26 +; NONEON-NOSVE-NEXT: .LBB7_10: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB7_27 +; NONEON-NOSVE-NEXT: .LBB7_11: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB7_28 +; NONEON-NOSVE-NEXT: .LBB7_12: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB7_29 +; NONEON-NOSVE-NEXT: .LBB7_13: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB7_30 +; NONEON-NOSVE-NEXT: .LBB7_14: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB7_31 +; NONEON-NOSVE-NEXT: .LBB7_15: // %else41 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32 +; NONEON-NOSVE-NEXT: .LBB7_16: // %else44 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB7_2 +; NONEON-NOSVE-NEXT: .LBB7_18: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB7_3 +; NONEON-NOSVE-NEXT: .LBB7_19: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB7_4 +; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #6 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB7_5 +; NONEON-NOSVE-NEXT: .LBB7_21: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB7_6 +; NONEON-NOSVE-NEXT: .LBB7_22: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #10 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB7_7 +; NONEON-NOSVE-NEXT: .LBB7_23: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB7_8 +; NONEON-NOSVE-NEXT: .LBB7_24: // %cond.load19 +; NONEON-NOSVE-NEXT: add x9, x0, #14 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB7_9 
+; NONEON-NOSVE-NEXT: .LBB7_25: // %cond.load22 +; NONEON-NOSVE-NEXT: add x9, x0, #16 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[0], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB7_10 +; NONEON-NOSVE-NEXT: .LBB7_26: // %cond.load25 +; NONEON-NOSVE-NEXT: add x9, x0, #18 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB7_11 +; NONEON-NOSVE-NEXT: .LBB7_27: // %cond.load28 +; NONEON-NOSVE-NEXT: add x9, x0, #20 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB7_12 +; NONEON-NOSVE-NEXT: .LBB7_28: // %cond.load31 +; NONEON-NOSVE-NEXT: add x9, x0, #22 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB7_13 +; NONEON-NOSVE-NEXT: .LBB7_29: // %cond.load34 +; NONEON-NOSVE-NEXT: add x9, x0, #24 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[4], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB7_14 +; NONEON-NOSVE-NEXT: .LBB7_30: // %cond.load37 +; NONEON-NOSVE-NEXT: add x9, x0, #26 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[5], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB7_15 +; NONEON-NOSVE-NEXT: .LBB7_31: // %cond.load40 +; NONEON-NOSVE-NEXT: add x9, x0, #28 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[6], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB7_16 +; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.load43 +; NONEON-NOSVE-NEXT: add x8, x0, #30 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[7], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer) ret <16 x half> %load } @@ -225,6 +936,31 @@ define <2 x float> @masked_load_v2f32(ptr %src, <2 x i1> %mask) { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, 
v1.8b +; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB8_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_4 +; NONEON-NOSVE-NEXT: .LBB8_2: // %else2 +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB8_3: // %cond.load +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB8_2 +; NONEON-NOSVE-NEXT: .LBB8_4: // %cond.load1 +; NONEON-NOSVE-NEXT: add x8, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x8] +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ret %load = call <2 x float> @llvm.masked.load.v2f32(ptr %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer) ret <2 x float> %load } @@ -241,6 +977,41 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI9_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h1, v0.4h +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB9_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_6 +; NONEON-NOSVE-NEXT: .LBB9_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB9_7 +; NONEON-NOSVE-NEXT: .LBB9_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB9_8 +; NONEON-NOSVE-NEXT: .LBB9_4: // %else8 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB9_5: // %cond.load +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB9_2 +; NONEON-NOSVE-NEXT: .LBB9_6: // 
%cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB9_3 +; NONEON-NOSVE-NEXT: .LBB9_7: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB9_4 +; NONEON-NOSVE-NEXT: .LBB9_8: // %cond.load7 +; NONEON-NOSVE-NEXT: add x8, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[3], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer) ret <4 x float> %load } @@ -290,6 +1061,66 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) { ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: addv b2, v0.8b +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s2 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB10_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB10_10 +; NONEON-NOSVE-NEXT: .LBB10_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB10_11 +; NONEON-NOSVE-NEXT: .LBB10_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB10_12 +; NONEON-NOSVE-NEXT: .LBB10_4: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB10_13 +; NONEON-NOSVE-NEXT: .LBB10_5: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB10_14 +; NONEON-NOSVE-NEXT: .LBB10_6: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB10_15 +; NONEON-NOSVE-NEXT: .LBB10_7: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB10_16 +; NONEON-NOSVE-NEXT: .LBB10_8: // %else20 +; NONEON-NOSVE-NEXT: ret +; 
NONEON-NOSVE-NEXT: .LBB10_9: // %cond.load +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_2 +; NONEON-NOSVE-NEXT: .LBB10_10: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB10_3 +; NONEON-NOSVE-NEXT: .LBB10_11: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB10_4 +; NONEON-NOSVE-NEXT: .LBB10_12: // %cond.load7 +; NONEON-NOSVE-NEXT: add x9, x0, #12 +; NONEON-NOSVE-NEXT: ld1 { v0.s }[3], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB10_5 +; NONEON-NOSVE-NEXT: .LBB10_13: // %cond.load10 +; NONEON-NOSVE-NEXT: add x9, x0, #16 +; NONEON-NOSVE-NEXT: ld1 { v1.s }[0], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB10_6 +; NONEON-NOSVE-NEXT: .LBB10_14: // %cond.load13 +; NONEON-NOSVE-NEXT: add x9, x0, #20 +; NONEON-NOSVE-NEXT: ld1 { v1.s }[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB10_7 +; NONEON-NOSVE-NEXT: .LBB10_15: // %cond.load16 +; NONEON-NOSVE-NEXT: add x9, x0, #24 +; NONEON-NOSVE-NEXT: ld1 { v1.s }[2], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB10_8 +; NONEON-NOSVE-NEXT: .LBB10_16: // %cond.load19 +; NONEON-NOSVE-NEXT: add x8, x0, #28 +; NONEON-NOSVE-NEXT: ld1 { v1.s }[3], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <8 x float> @llvm.masked.load.v8f32(ptr %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer) ret <8 x float> %load } @@ -306,6 +1137,29 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI11_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s 
+; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB11_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_4 +; NONEON-NOSVE-NEXT: .LBB11_2: // %else2 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB11_3: // %cond.load +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB11_2 +; NONEON-NOSVE-NEXT: .LBB11_4: // %cond.load1 +; NONEON-NOSVE-NEXT: add x8, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.d }[1], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer) ret <2 x double> %load } @@ -331,6 +1185,42 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI12_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI12_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: addv h2, v0.4h +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: fmov w8, s2 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB12_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB12_6 +; NONEON-NOSVE-NEXT: .LBB12_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB12_7 +; NONEON-NOSVE-NEXT: .LBB12_3: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB12_8 +; NONEON-NOSVE-NEXT: .LBB12_4: // %else8 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB12_5: // %cond.load +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB12_2 +; NONEON-NOSVE-NEXT: .LBB12_6: // %cond.load1 +; NONEON-NOSVE-NEXT: add x9, x0, #8 +; NONEON-NOSVE-NEXT: ld1 { v0.d 
}[1], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB12_3 +; NONEON-NOSVE-NEXT: .LBB12_7: // %cond.load4 +; NONEON-NOSVE-NEXT: add x9, x0, #16 +; NONEON-NOSVE-NEXT: ld1 { v1.d }[0], [x9] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB12_4 +; NONEON-NOSVE-NEXT: .LBB12_8: // %cond.load7 +; NONEON-NOSVE-NEXT: add x8, x0, #24 +; NONEON-NOSVE-NEXT: ld1 { v1.d }[1], [x8] +; NONEON-NOSVE-NEXT: ret %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer) ret <4 x double> %load } @@ -356,6 +1246,38 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_zext_v3i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: and w8, w1, #0x1 +; NONEON-NOSVE-NEXT: bfi w8, w2, #1, #1 +; NONEON-NOSVE-NEXT: bfi w8, w3, #2, #1 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB13_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB13_3 +; NONEON-NOSVE-NEXT: b .LBB13_4 +; NONEON-NOSVE-NEXT: .LBB13_2: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB13_4 +; NONEON-NOSVE-NEXT: .LBB13_3: // %cond.load1 +; NONEON-NOSVE-NEXT: mov v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] +; NONEON-NOSVE-NEXT: mov v1.h[2], v0.h[2] +; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: .LBB13_4: // %else2 +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB13_6 +; NONEON-NOSVE-NEXT: // %bb.5: // %cond.load4 +; NONEON-NOSVE-NEXT: mov v0.h[1], v0.h[1] +; NONEON-NOSVE-NEXT: add x8, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: .LBB13_6: // %else5 +; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %load_value = tail call 
<3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) %extend = zext <3 x i16> %load_value to <3 x i32> ret <3 x i32> %extend; @@ -382,6 +1304,38 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_load_sext_v3i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: and w8, w1, #0x1 +; NONEON-NOSVE-NEXT: bfi w8, w2, #1, #1 +; NONEON-NOSVE-NEXT: bfi w8, w3, #2, #1 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB14_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB14_3 +; NONEON-NOSVE-NEXT: b .LBB14_4 +; NONEON-NOSVE-NEXT: .LBB14_2: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB14_4 +; NONEON-NOSVE-NEXT: .LBB14_3: // %cond.load1 +; NONEON-NOSVE-NEXT: mov v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add x9, x0, #2 +; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] +; NONEON-NOSVE-NEXT: mov v1.h[2], v0.h[2] +; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: .LBB14_4: // %else2 +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB14_6 +; NONEON-NOSVE-NEXT: // %bb.5: // %cond.load4 +; NONEON-NOSVE-NEXT: mov v0.h[1], v0.h[1] +; NONEON-NOSVE-NEXT: add x8, x0, #4 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: .LBB14_6: // %else5 +; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) %extend = sext <3 x i16> %load_value to <3 x i32> ret <3 x i32> %extend; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index 
bd6b96889b4cc5..0904399558aee1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,37 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI0_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB0_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB0_6 +; NONEON-NOSVE-NEXT: .LBB0_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB0_7 +; NONEON-NOSVE-NEXT: .LBB0_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB0_8 +; NONEON-NOSVE-NEXT: .LBB0_4: // %else6 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB0_5: // %cond.store +; NONEON-NOSVE-NEXT: strb wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB0_2 +; NONEON-NOSVE-NEXT: .LBB0_6: // %cond.store1 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB0_3 +; NONEON-NOSVE-NEXT: .LBB0_7: // %cond.store3 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB0_4 +; NONEON-NOSVE-NEXT: .LBB0_8: // %cond.store5 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void } @@ 
-34,6 +66,57 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB1_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB1_10 +; NONEON-NOSVE-NEXT: .LBB1_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB1_11 +; NONEON-NOSVE-NEXT: .LBB1_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB1_12 +; NONEON-NOSVE-NEXT: .LBB1_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB1_13 +; NONEON-NOSVE-NEXT: .LBB1_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB1_14 +; NONEON-NOSVE-NEXT: .LBB1_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB1_15 +; NONEON-NOSVE-NEXT: .LBB1_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_16 +; NONEON-NOSVE-NEXT: .LBB1_8: // %else14 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB1_9: // %cond.store +; NONEON-NOSVE-NEXT: strb wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB1_2 +; NONEON-NOSVE-NEXT: .LBB1_10: // %cond.store1 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB1_3 +; NONEON-NOSVE-NEXT: .LBB1_11: // %cond.store3 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB1_4 +; NONEON-NOSVE-NEXT: .LBB1_12: // %cond.store5 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB1_5 +; NONEON-NOSVE-NEXT: .LBB1_13: // %cond.store7 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB1_6 +; NONEON-NOSVE-NEXT: .LBB1_14: // %cond.store9 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #5] +; 
NONEON-NOSVE-NEXT: tbz w8, #6, .LBB1_7 +; NONEON-NOSVE-NEXT: .LBB1_15: // %cond.store11 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_8 +; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.store13 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void } @@ -49,6 +132,99 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18 +; NONEON-NOSVE-NEXT: .LBB2_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB2_19 +; NONEON-NOSVE-NEXT: .LBB2_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB2_20 +; NONEON-NOSVE-NEXT: .LBB2_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB2_21 +; NONEON-NOSVE-NEXT: .LBB2_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB2_22 +; NONEON-NOSVE-NEXT: .LBB2_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB2_23 +; NONEON-NOSVE-NEXT: .LBB2_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB2_24 +; NONEON-NOSVE-NEXT: .LBB2_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB2_25 +; NONEON-NOSVE-NEXT: .LBB2_9: // %else16 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB2_26 +; NONEON-NOSVE-NEXT: .LBB2_10: // %else18 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB2_27 +; NONEON-NOSVE-NEXT: .LBB2_11: // %else20 +; 
NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB2_28 +; NONEON-NOSVE-NEXT: .LBB2_12: // %else22 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB2_29 +; NONEON-NOSVE-NEXT: .LBB2_13: // %else24 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB2_30 +; NONEON-NOSVE-NEXT: .LBB2_14: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB2_31 +; NONEON-NOSVE-NEXT: .LBB2_15: // %else28 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32 +; NONEON-NOSVE-NEXT: .LBB2_16: // %else30 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.store +; NONEON-NOSVE-NEXT: strb wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB2_2 +; NONEON-NOSVE-NEXT: .LBB2_18: // %cond.store1 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB2_3 +; NONEON-NOSVE-NEXT: .LBB2_19: // %cond.store3 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB2_4 +; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.store5 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB2_5 +; NONEON-NOSVE-NEXT: .LBB2_21: // %cond.store7 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB2_6 +; NONEON-NOSVE-NEXT: .LBB2_22: // %cond.store9 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #5] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB2_7 +; NONEON-NOSVE-NEXT: .LBB2_23: // %cond.store11 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB2_8 +; NONEON-NOSVE-NEXT: .LBB2_24: // %cond.store13 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB2_9 +; NONEON-NOSVE-NEXT: .LBB2_25: // %cond.store15 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB2_10 +; NONEON-NOSVE-NEXT: .LBB2_26: // %cond.store17 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #9] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB2_11 +; NONEON-NOSVE-NEXT: .LBB2_27: // %cond.store19 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #10] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB2_12 +; NONEON-NOSVE-NEXT: .LBB2_28: // %cond.store21 +; NONEON-NOSVE-NEXT: strb wzr, [x0, 
#11] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB2_13 +; NONEON-NOSVE-NEXT: .LBB2_29: // %cond.store23 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB2_14 +; NONEON-NOSVE-NEXT: .LBB2_30: // %cond.store25 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #13] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB2_15 +; NONEON-NOSVE-NEXT: .LBB2_31: // %cond.store27 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #14] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16 +; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.store29 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #15] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void } @@ -129,6 +305,244 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: fmov s1, w1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] +; NONEON-NOSVE-NEXT: mov v1.b[1], w2 +; NONEON-NOSVE-NEXT: mov v0.b[1], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: mov v1.b[2], w3 +; NONEON-NOSVE-NEXT: mov v0.b[2], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: mov v1.b[3], w4 +; NONEON-NOSVE-NEXT: mov v0.b[3], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] +; NONEON-NOSVE-NEXT: mov v1.b[4], w5 +; NONEON-NOSVE-NEXT: mov v0.b[4], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: mov v1.b[5], w6 +; NONEON-NOSVE-NEXT: mov v0.b[5], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] +; NONEON-NOSVE-NEXT: mov v1.b[6], w7 +; NONEON-NOSVE-NEXT: mov v0.b[6], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: mov v1.b[7], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: mov v0.b[7], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] +; NONEON-NOSVE-NEXT: mov v1.b[8], w9 +; 
NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: mov v0.b[8], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: mov v1.b[9], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: mov v0.b[9], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] +; NONEON-NOSVE-NEXT: mov v1.b[10], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: mov v0.b[10], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: mov v1.b[11], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: mov v0.b[11], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] +; NONEON-NOSVE-NEXT: mov v1.b[12], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: mov v0.b[12], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: mov v1.b[13], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] +; NONEON-NOSVE-NEXT: mov v0.b[13], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] +; NONEON-NOSVE-NEXT: mov v1.b[14], w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #64] +; NONEON-NOSVE-NEXT: mov v0.b[14], w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: mov v1.b[15], w9 +; NONEON-NOSVE-NEXT: mov v0.b[15], w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] +; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v1.16b, v1.16b, v3.16b +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: addv h1, v1.8h +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w8, s1 +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: bfi w8, w9, #16, #16 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; 
NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34 +; NONEON-NOSVE-NEXT: .LBB3_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35 +; NONEON-NOSVE-NEXT: .LBB3_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36 +; NONEON-NOSVE-NEXT: .LBB3_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37 +; NONEON-NOSVE-NEXT: .LBB3_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38 +; NONEON-NOSVE-NEXT: .LBB3_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39 +; NONEON-NOSVE-NEXT: .LBB3_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40 +; NONEON-NOSVE-NEXT: .LBB3_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41 +; NONEON-NOSVE-NEXT: .LBB3_9: // %else16 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42 +; NONEON-NOSVE-NEXT: .LBB3_10: // %else18 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43 +; NONEON-NOSVE-NEXT: .LBB3_11: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44 +; NONEON-NOSVE-NEXT: .LBB3_12: // %else22 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45 +; NONEON-NOSVE-NEXT: .LBB3_13: // %else24 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46 +; NONEON-NOSVE-NEXT: .LBB3_14: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47 +; NONEON-NOSVE-NEXT: .LBB3_15: // %else28 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48 +; NONEON-NOSVE-NEXT: .LBB3_16: // %else30 +; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49 +; NONEON-NOSVE-NEXT: .LBB3_17: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50 +; NONEON-NOSVE-NEXT: .LBB3_18: // %else34 +; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51 +; NONEON-NOSVE-NEXT: .LBB3_19: // %else36 +; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52 +; NONEON-NOSVE-NEXT: .LBB3_20: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53 +; NONEON-NOSVE-NEXT: .LBB3_21: // %else40 +; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54 +; NONEON-NOSVE-NEXT: .LBB3_22: // %else42 +; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_55 +; NONEON-NOSVE-NEXT: .LBB3_23: // %else44 +; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56 +; NONEON-NOSVE-NEXT: .LBB3_24: // %else46 
+; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57 +; NONEON-NOSVE-NEXT: .LBB3_25: // %else48 +; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58 +; NONEON-NOSVE-NEXT: .LBB3_26: // %else50 +; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_59 +; NONEON-NOSVE-NEXT: .LBB3_27: // %else52 +; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60 +; NONEON-NOSVE-NEXT: .LBB3_28: // %else54 +; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61 +; NONEON-NOSVE-NEXT: .LBB3_29: // %else56 +; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62 +; NONEON-NOSVE-NEXT: .LBB3_30: // %else58 +; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63 +; NONEON-NOSVE-NEXT: .LBB3_31: // %else60 +; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64 +; NONEON-NOSVE-NEXT: .LBB3_32: // %else62 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.store +; NONEON-NOSVE-NEXT: strb wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2 +; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store1 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3 +; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store3 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4 +; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store5 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5 +; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store7 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6 +; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store9 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #5] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7 +; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.store11 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8 +; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store13 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9 +; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store15 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10 +; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store17 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #9] 
+; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11 +; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store19 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #10] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12 +; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store21 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #11] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13 +; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store23 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14 +; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store25 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #13] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15 +; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store27 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #14] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16 +; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store29 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #15] +; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17 +; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store31 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #16] +; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18 +; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.store33 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #17] +; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19 +; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store35 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #18] +; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20 +; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store37 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #19] +; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21 +; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store39 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #20] +; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22 +; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store41 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #21] +; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23 +; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store43 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #22] +; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24 +; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store45 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #23] +; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25 +; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store47 +; 
NONEON-NOSVE-NEXT: strb wzr, [x0, #24] +; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26 +; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store49 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #25] +; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27 +; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store51 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #26] +; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28 +; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store53 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #27] +; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29 +; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.store55 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #28] +; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30 +; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store57 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #29] +; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31 +; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store59 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #30] +; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32 +; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store61 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #31] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask) ret void } @@ -154,6 +568,29 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB4_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_4 +; NONEON-NOSVE-NEXT: .LBB4_2: // %else2 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.store +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0] +; NONEON-NOSVE-NEXT: tbz 
w8, #1, .LBB4_2 +; NONEON-NOSVE-NEXT: .LBB4_4: // %cond.store1 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v2f16(<2 x half> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void } @@ -169,6 +606,41 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB5_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_6 +; NONEON-NOSVE-NEXT: .LBB5_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB5_7 +; NONEON-NOSVE-NEXT: .LBB5_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB5_8 +; NONEON-NOSVE-NEXT: .LBB5_4: // %else6 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB5_5: // %cond.store +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB5_2 +; NONEON-NOSVE-NEXT: .LBB5_6: // %cond.store1 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB5_3 +; NONEON-NOSVE-NEXT: .LBB5_7: // %cond.store3 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB5_4 +; NONEON-NOSVE-NEXT: .LBB5_8: // %cond.store5 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #6] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void } @@ -185,6 +657,65 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; 
CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB6_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_10 +; NONEON-NOSVE-NEXT: .LBB6_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB6_11 +; NONEON-NOSVE-NEXT: .LBB6_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB6_12 +; NONEON-NOSVE-NEXT: .LBB6_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB6_13 +; NONEON-NOSVE-NEXT: .LBB6_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB6_14 +; NONEON-NOSVE-NEXT: .LBB6_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB6_15 +; NONEON-NOSVE-NEXT: .LBB6_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_16 +; NONEON-NOSVE-NEXT: .LBB6_8: // %else14 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB6_9: // %cond.store +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB6_2 +; NONEON-NOSVE-NEXT: .LBB6_10: // %cond.store1 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB6_3 +; NONEON-NOSVE-NEXT: .LBB6_11: // %cond.store3 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB6_4 +; NONEON-NOSVE-NEXT: .LBB6_12: // %cond.store5 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB6_5 +; NONEON-NOSVE-NEXT: .LBB6_13: // %cond.store7 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB6_6 +; 
NONEON-NOSVE-NEXT: .LBB6_14: // %cond.store9 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #10] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB6_7 +; NONEON-NOSVE-NEXT: .LBB6_15: // %cond.store11 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB6_8 +; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.store13 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #14] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void } @@ -209,6 +740,115 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; CHECK-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1] ; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: addv h0, v0.8h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18 +; NONEON-NOSVE-NEXT: .LBB7_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB7_19 +; NONEON-NOSVE-NEXT: .LBB7_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB7_20 +; NONEON-NOSVE-NEXT: .LBB7_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB7_21 +; NONEON-NOSVE-NEXT: .LBB7_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB7_22 +; NONEON-NOSVE-NEXT: .LBB7_6: // %else10 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB7_23 +; NONEON-NOSVE-NEXT: .LBB7_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB7_24 +; NONEON-NOSVE-NEXT: .LBB7_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB7_25 +; 
NONEON-NOSVE-NEXT: .LBB7_9: // %else16 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB7_26 +; NONEON-NOSVE-NEXT: .LBB7_10: // %else18 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB7_27 +; NONEON-NOSVE-NEXT: .LBB7_11: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB7_28 +; NONEON-NOSVE-NEXT: .LBB7_12: // %else22 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB7_29 +; NONEON-NOSVE-NEXT: .LBB7_13: // %else24 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB7_30 +; NONEON-NOSVE-NEXT: .LBB7_14: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB7_31 +; NONEON-NOSVE-NEXT: .LBB7_15: // %else28 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32 +; NONEON-NOSVE-NEXT: .LBB7_16: // %else30 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.store +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB7_2 +; NONEON-NOSVE-NEXT: .LBB7_18: // %cond.store1 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB7_3 +; NONEON-NOSVE-NEXT: .LBB7_19: // %cond.store3 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB7_4 +; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.store5 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #6] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB7_5 +; NONEON-NOSVE-NEXT: .LBB7_21: // %cond.store7 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB7_6 +; NONEON-NOSVE-NEXT: .LBB7_22: // %cond.store9 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #10] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB7_7 +; NONEON-NOSVE-NEXT: .LBB7_23: // %cond.store11 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB7_8 +; NONEON-NOSVE-NEXT: .LBB7_24: // %cond.store13 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #14] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB7_9 +; 
NONEON-NOSVE-NEXT: .LBB7_25: // %cond.store15 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #16] +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB7_10 +; NONEON-NOSVE-NEXT: .LBB7_26: // %cond.store17 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #18] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB7_11 +; NONEON-NOSVE-NEXT: .LBB7_27: // %cond.store19 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #20] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB7_12 +; NONEON-NOSVE-NEXT: .LBB7_28: // %cond.store21 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #22] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB7_13 +; NONEON-NOSVE-NEXT: .LBB7_29: // %cond.store23 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #24] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB7_14 +; NONEON-NOSVE-NEXT: .LBB7_30: // %cond.store25 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #26] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB7_15 +; NONEON-NOSVE-NEXT: .LBB7_31: // %cond.store27 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #28] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB7_16 +; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.store29 +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: str h0, [x0, #30] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void } @@ -225,6 +865,37 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz 
w8, #0, .LBB8_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_6 +; NONEON-NOSVE-NEXT: .LBB8_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB8_7 +; NONEON-NOSVE-NEXT: .LBB8_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB8_8 +; NONEON-NOSVE-NEXT: .LBB8_4: // %else6 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB8_5: // %cond.store +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB8_2 +; NONEON-NOSVE-NEXT: .LBB8_6: // %cond.store1 +; NONEON-NOSVE-NEXT: str wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB8_3 +; NONEON-NOSVE-NEXT: .LBB8_7: // %cond.store3 +; NONEON-NOSVE-NEXT: str wzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB8_4 +; NONEON-NOSVE-NEXT: .LBB8_8: // %cond.store5 +; NONEON-NOSVE-NEXT: str wzr, [x0, #12] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void } @@ -275,6 +946,57 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI9_0] +; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv b0, v0.8b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB9_9 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_10 +; NONEON-NOSVE-NEXT: .LBB9_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB9_11 +; NONEON-NOSVE-NEXT: .LBB9_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB9_12 +; NONEON-NOSVE-NEXT: .LBB9_4: // %else6 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB9_13 +; NONEON-NOSVE-NEXT: .LBB9_5: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB9_14 +; NONEON-NOSVE-NEXT: .LBB9_6: // %else10 +; 
NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB9_15 +; NONEON-NOSVE-NEXT: .LBB9_7: // %else12 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB9_16 +; NONEON-NOSVE-NEXT: .LBB9_8: // %else14 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB9_9: // %cond.store +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB9_2 +; NONEON-NOSVE-NEXT: .LBB9_10: // %cond.store1 +; NONEON-NOSVE-NEXT: str wzr, [x0, #4] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB9_3 +; NONEON-NOSVE-NEXT: .LBB9_11: // %cond.store3 +; NONEON-NOSVE-NEXT: str wzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB9_4 +; NONEON-NOSVE-NEXT: .LBB9_12: // %cond.store5 +; NONEON-NOSVE-NEXT: str wzr, [x0, #12] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB9_5 +; NONEON-NOSVE-NEXT: .LBB9_13: // %cond.store7 +; NONEON-NOSVE-NEXT: str wzr, [x0, #16] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB9_6 +; NONEON-NOSVE-NEXT: .LBB9_14: // %cond.store9 +; NONEON-NOSVE-NEXT: str wzr, [x0, #20] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB9_7 +; NONEON-NOSVE-NEXT: .LBB9_15: // %cond.store11 +; NONEON-NOSVE-NEXT: str wzr, [x0, #24] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB9_8 +; NONEON-NOSVE-NEXT: .LBB9_16: // %cond.store13 +; NONEON-NOSVE-NEXT: str wzr, [x0, #28] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void } @@ -291,6 +1013,27 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] +; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB10_3 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, 
#1, .LBB10_4 +; NONEON-NOSVE-NEXT: .LBB10_2: // %else2 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB10_3: // %cond.store +; NONEON-NOSVE-NEXT: str xzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_2 +; NONEON-NOSVE-NEXT: .LBB10_4: // %cond.store1 +; NONEON-NOSVE-NEXT: str xzr, [x0, #8] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void } @@ -315,6 +1058,37 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: masked_store_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 +; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI11_0] +; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: addv h0, v0.4h +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB11_5 +; NONEON-NOSVE-NEXT: // %bb.1: // %else +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_6 +; NONEON-NOSVE-NEXT: .LBB11_2: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB11_7 +; NONEON-NOSVE-NEXT: .LBB11_3: // %else4 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB11_8 +; NONEON-NOSVE-NEXT: .LBB11_4: // %else6 +; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB11_5: // %cond.store +; NONEON-NOSVE-NEXT: str xzr, [x0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB11_2 +; NONEON-NOSVE-NEXT: .LBB11_6: // %cond.store1 +; NONEON-NOSVE-NEXT: str xzr, [x0, #8] +; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB11_3 +; NONEON-NOSVE-NEXT: .LBB11_7: // %cond.store3 +; NONEON-NOSVE-NEXT: str xzr, [x0, #16] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB11_4 +; NONEON-NOSVE-NEXT: .LBB11_8: // %cond.store5 +; NONEON-NOSVE-NEXT: str xzr, [x0, #24] +; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void } diff --git 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll index aef446a90df656..6a6b47e815ac16 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -14,6 +15,15 @@ define void @add_v4i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ldr s1, [x1] +; NONEON-NOSVE-NEXT: uaddl v0.8h, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i8>, ptr %a %op2 = load <4 x i8>, ptr %b %res = add <4 x i8> %op1, %op2 @@ -29,6 +39,14 @@ define void @add_v8i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: add v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b %res = add <8 x i8> %op1, %op2 @@ -44,6 +62,14 @@ define void @add_v16i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, 
[x1] +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i8>, ptr %a %op2 = load <16 x i8>, ptr %b %res = add <16 x i8> %op1, %op2 @@ -60,6 +86,15 @@ define void @add_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.b, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %res = add <32 x i8> %op1, %op2 @@ -76,6 +111,23 @@ define void @add_v2i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: ldrh w9, [x1] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: fmov s1, w9 +; NONEON-NOSVE-NEXT: add x8, x0, #2 +; NONEON-NOSVE-NEXT: add x9, x1, #2 +; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: ld1 { v1.h }[2], [x9] +; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: mov w8, v0.s[1] +; NONEON-NOSVE-NEXT: fmov w9, s0 +; NONEON-NOSVE-NEXT: strh w9, [x0] +; NONEON-NOSVE-NEXT: strh w8, [x0, #2] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i16>, ptr %a %op2 = load <2 x i16>, ptr %b %res = add <2 x i16> %op1, %op2 @@ -91,6 +143,14 @@ define void @add_v4i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, 
ptr %a %op2 = load <4 x i16>, ptr %b %res = add <4 x i16> %op1, %op2 @@ -106,6 +166,14 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b %res = add <8 x i16> %op1, %op2 @@ -122,6 +190,15 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: add z1.h, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: add_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %res = add <16 x i16> %op1, %op2 @@ -137,6 +214,13 @@ define void @abs_v2i32(ptr %a) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i32>, ptr %a %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) store <2 x i32> %res, ptr %a @@ -151,6 +235,13 @@ define void @abs_v4i32(ptr %a) { ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) store <4 x 
i32> %res, ptr %a @@ -166,6 +257,14 @@ define void @abs_v8i32(ptr %a) { ; CHECK-NEXT: abs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: abs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) store <8 x i32> %res, ptr %a @@ -180,6 +279,13 @@ define void @abs_v2i64(ptr %a) { ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) store <2 x i64> %res, ptr %a @@ -195,6 +301,14 @@ define void @abs_v4i64(ptr %a) { ; CHECK-NEXT: abs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: abs_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: abs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) store <4 x i64> %res, ptr %a @@ -211,6 +325,17 @@ define void @fadd_v2f16(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: ldr s1, [x1] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load 
<2 x half>, ptr %a %op2 = load <2 x half>, ptr %b %res = fadd <2 x half> %op1, %op2 @@ -227,6 +352,17 @@ define void @fadd_v4f16(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %op2 = load <4 x half>, ptr %b %res = fadd <4 x half> %op1, %op2 @@ -243,6 +379,21 @@ define void @fadd_v8f16(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v2.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %op2 = load <8 x half>, ptr %b %res = fadd <8 x half> %op1, %op2 @@ -261,6 +412,29 @@ define void @fadd_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h +; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h +; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h +; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h +; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h +; 
NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h +; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h +; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h +; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s +; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s +; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s +; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s +; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %res = fadd <16 x half> %op1, %op2 @@ -277,6 +451,14 @@ define void @fadd_v2f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: fadd v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %op2 = load <2 x float>, ptr %b %res = fadd <2 x float> %op1, %op2 @@ -293,6 +475,14 @@ define void @fadd_v4f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %op2 = load <4 x float>, ptr %b %res = fadd <4 x float> %op1, %op2 @@ -311,6 +501,15 @@ define void @fadd_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd 
v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %res = fadd <8 x float> %op1, %op2 @@ -327,6 +526,14 @@ define void @fadd_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: fadd v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %op2 = load <2 x double>, ptr %b %res = fadd <2 x double> %op1, %op2 @@ -345,6 +552,15 @@ define void @fadd_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fadd_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %res = fadd <4 x double> %op1, %op2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll index 6d91253caae58f..03bb899c517b4e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -15,6 +16,14 @@ define void 
@test_revbv16i16(ptr %a) { ; CHECK-NEXT: revb z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revbv16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> store <32 x i8> %tmp2, ptr %a @@ -31,6 +40,14 @@ define void @test_revbv8i32(ptr %a) { ; CHECK-NEXT: revb z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revbv8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> store <32 x i8> %tmp2, ptr %a @@ -47,6 +64,14 @@ define void @test_revbv4i64(ptr %a) { ; CHECK-NEXT: revb z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revbv4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> store <32 x i8> %tmp2, ptr %a @@ -63,6 +88,14 @@ define void @test_revhv8i32(ptr %a) { ; CHECK-NEXT: revh z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revhv8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.8h, v0.8h +; NONEON-NOSVE-NEXT: rev32 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = 
shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> store <16 x i16> %tmp2, ptr %a @@ -79,6 +112,14 @@ define void @test_revhv8f32(ptr %a) { ; CHECK-NEXT: revh z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revhv8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.8h, v0.8h +; NONEON-NOSVE-NEXT: rev32 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x half>, ptr %a %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> store <16 x half> %tmp2, ptr %a @@ -95,6 +136,14 @@ define void @test_revhv4i64(ptr %a) { ; CHECK-NEXT: revh z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revhv4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.8h, v0.8h +; NONEON-NOSVE-NEXT: rev64 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> store <16 x i16> %tmp2, ptr %a @@ -111,6 +160,14 @@ define void @test_revwv4i64(ptr %a) { ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revwv4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s +; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> store <8 x i32> %tmp2, ptr %a @@ -127,6 +184,14 @@ define void @test_revwv4f64(ptr %a) { ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revwv4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s +; NONEON-NOSVE-NEXT: 
rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> store <8 x float> %tmp2, ptr %a @@ -141,6 +206,12 @@ define <16 x i8> @test_revv16i8(ptr %a) { ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revv16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i8>, ptr %a %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> ret <16 x i8> %tmp2 @@ -156,6 +227,14 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revwv8i32v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s +; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> @@ -176,6 +255,18 @@ define void @test_revhv32i16(ptr %a) { ; CHECK-NEXT: stp q0, q1, [x0, #32] ; CHECK-NEXT: stp q2, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revhv32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.8h, v0.8h +; NONEON-NOSVE-NEXT: rev64 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: rev64 v2.8h, v2.8h +; NONEON-NOSVE-NEXT: rev64 v3.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i16>, ptr %a %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> store <32 x i16> %tmp2, ptr %a @@ -191,6 +282,14 @@ define void @test_rev_elts_fail(ptr %a) { ; 
CHECK-NEXT: tbl z0.d, { z2.d }, z0.d ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_rev_elts_fail: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> store <4 x i64> %tmp2, ptr %a @@ -208,6 +307,15 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 { ; CHECK-NEXT: revd z1.q, p0/m, z1.q ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revdv4i64_sve2p1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ptrue p0.d, vl2 +; NONEON-NOSVE-NEXT: revd z0.q, p0/m, z0.q +; NONEON-NOSVE-NEXT: revd z1.q, p0/m, z1.q +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> store <4 x i64> %tmp2, ptr %a @@ -223,6 +331,15 @@ define void @test_revdv4f64_sve2p1(ptr %a) #1 { ; CHECK-NEXT: revd z1.q, p0/m, z1.q ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revdv4f64_sve2p1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ptrue p0.d +; NONEON-NOSVE-NEXT: revd z0.q, p0/m, z0.q +; NONEON-NOSVE-NEXT: revd z1.q, p0/m, z1.q +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> store <4 x double> %tmp2, ptr %a @@ -238,6 +355,16 @@ define void @test_revv8i32(ptr %a) { ; CHECK-NEXT: tbl z0.s, { z2.s }, z0.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_revv8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s +; NONEON-NOSVE-NEXT: rev64 
v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> store <8 x i32> %tmp2, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index 8808ad9a23d7c5..f254a1f9098f2d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -68,6 +69,18 @@ define void @zip1_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip1_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: zip2 v2.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <32 x i8>, ptr %a %tmp2 = load volatile <32 x i8>, ptr %b %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> @@ -196,6 +209,28 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip_v32i16: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: ldp q4, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q5, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q6, q2, [x1, #32] +; NONEON-NOSVE-NEXT: ldp q7, q3, [x1] +; NONEON-NOSVE-NEXT: zip1 v17.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: zip2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: zip1 v16.8h, v1.8h, v3.8h +; NONEON-NOSVE-NEXT: zip2 v1.8h, v1.8h, v3.8h +; NONEON-NOSVE-NEXT: zip1 v2.8h, v5.8h, v7.8h +; NONEON-NOSVE-NEXT: zip1 v3.8h, v4.8h, v6.8h +; NONEON-NOSVE-NEXT: zip2 v5.8h, v5.8h, v7.8h +; NONEON-NOSVE-NEXT: zip2 v4.8h, v4.8h, v6.8h +; NONEON-NOSVE-NEXT: add v6.8h, v16.8h, v17.8h +; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: add v2.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: stp q6, q0, [x0, #32] +; NONEON-NOSVE-NEXT: stp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i16>, ptr %a %tmp2 = load <32 x i16>, ptr %b %tmp3 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32> @@ -244,6 +279,18 @@ define void @zip1_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip1_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: zip2 v2.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: zip1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <16 x i16>, ptr %a %tmp2 = load volatile <16 x i16>, ptr %b %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> @@ -276,6 +323,18 @@ define void @zip1_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip1_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] 
+; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = load volatile <8 x i32>, ptr %b %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> @@ -298,6 +357,19 @@ define void @zip_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v3.2d +; NONEON-NOSVE-NEXT: zip1 v5.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: zip2 v1.2d, v1.2d, v3.2d +; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: fadd v2.2d, v4.2d, v5.2d +; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> @@ -330,6 +402,16 @@ define void @zip_v4i32(ptr %a, ptr %b) { ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: zip1 v2.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: zip2 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i32>, ptr %a %tmp2 = load <4 x i32>, ptr %b %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> @@ -351,6 +433,16 @@ define void @zip1_v8i32_undef(ptr %a) { ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip1_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, 
[x0, #16] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: zip2 v1.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> store volatile <8 x i32> %tmp2, ptr %a @@ -370,6 +462,19 @@ define void @trn_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.b, z1.b, z2.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: trn1 v4.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: trn2 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: trn1 v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: trn2 v2.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: add v0.16b, v4.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = load <32 x i8>, ptr %b %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> @@ -392,6 +497,19 @@ define void @trn_v8i16(ptr %a, ptr %b) { ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI8_1 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: ldr q2, [x9, :lo12:.LCPI8_1] +; NONEON-NOSVE-NEXT: tbl v0.16b, { v1.16b }, v0.16b +; NONEON-NOSVE-NEXT: tbl v1.16b, { v1.16b }, v2.16b +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i16>, ptr %a %tmp2 = load <8 x i16>, ptr %b %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> @@ -414,6 +532,19 @@ define void @trn_v16i16(ptr %a, ptr %b) { ; 
CHECK-NEXT: add z1.h, z1.h, z2.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: trn1 v4.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: trn2 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: trn1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: trn2 v2.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: add v0.8h, v4.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = load <16 x i16>, ptr %b %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> @@ -436,6 +567,19 @@ define void @trn_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: add z1.s, z1.s, z2.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: trn1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: trn2 v2.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: add v0.4s, v4.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> @@ -459,6 +603,19 @@ define void @trn_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: zip1 v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: zip2 v2.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: fadd v0.2d, v4.2d, v0.2d +; 
NONEON-NOSVE-NEXT: fadd v1.2d, v1.2d, v2.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> @@ -479,6 +636,16 @@ define void @trn_v4f32(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: trn1 v2.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x float>, ptr %a %tmp2 = load <4 x float>, ptr %b %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> @@ -500,6 +667,18 @@ define void @trn_v8i32_undef(ptr %a) { ; CHECK-NEXT: add z1.s, z3.s, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trn_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: trn1 v2.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: trn1 v3.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: trn2 v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -571,6 +750,18 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{ ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip2_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; 
NONEON-NOSVE-NEXT: zip2 v2.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <32 x i8>, ptr %a %tmp2 = load volatile <32 x i8>, ptr %b %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> @@ -617,6 +808,18 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip2_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: zip2 v2.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: zip1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <16 x i16>, ptr %a %tmp2 = load volatile <16 x i16>, ptr %b %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> @@ -649,6 +852,18 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{ ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip2_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: str q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = load volatile <8 x i32>, ptr %b %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> @@ -668,6 +883,16 @@ define void @zip2_v8i32_undef(ptr %a) #0{ ; CHECK-NEXT: str q1, [x0, #16] ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip2_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] 
+; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: zip2 v1.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> store volatile <8 x i32> %tmp2, ptr %a @@ -869,6 +1094,19 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{ ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: uzp1 v4.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp2 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: uzp2 v2.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: add v0.16b, v4.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = load <32 x i8>, ptr %b %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> @@ -891,6 +1129,17 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ext v1.8b, v0.8b, v0.8b, #6 +; NONEON-NOSVE-NEXT: ext v2.8b, v0.8b, v0.8b, #2 +; NONEON-NOSVE-NEXT: trn1 v1.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: zip1 v0.4h, v2.4h, v0.4h +; NONEON-NOSVE-NEXT: add v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i16>, ptr %a %tmp2 = load <4 x i16>, ptr %b %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> @@ -1008,6 +1257,19 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; +; 
NONEON-NOSVE-LABEL: uzp_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: uzp1 v4.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp2 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: add v0.8h, v4.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = load <16 x i16>, ptr %b %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> @@ -1047,6 +1309,19 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{ ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp2 v2.4s, v3.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v0.4s, v4.4s, v0.4s +; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = load <8 x float>, ptr %b %tmp3 = shufflevector <8 x float> %tmp1, <8 x float> %tmp2, <8 x i32> @@ -1069,6 +1344,19 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{ ; CHECK-NEXT: add z1.d, z1.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: zip2 v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: zip1 v1.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: zip2 v2.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: add v0.2d, v4.2d, v0.2d +; NONEON-NOSVE-NEXT: add v1.2d, v1.2d, v2.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = load <4 x i64>, ptr %b %tmp3 = shufflevector <4 x i64> %tmp1, <4 x i64> %tmp2, <4 x i32> @@ -1136,6 +1424,16 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: add v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i16>, ptr %a %tmp2 = load <8 x i16>, ptr %b %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> @@ -1174,6 +1472,15 @@ define void @uzp_v8i32_undef(ptr %a) #0{ ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uzp_v8i32_undef: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp2 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -1197,6 +1504,19 @@ define void @zip_vscale2_4(ptr %a, ptr %b) { ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: zip_vscale2_4: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v3.2d +; NONEON-NOSVE-NEXT: zip1 v5.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: zip2 v1.2d, v1.2d, v3.2d +; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v2.2d +; NONEON-NOSVE-NEXT: fadd v2.2d, v4.2d, v5.2d +; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: stp q2, q0, 
[x0] +; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll index 8039bd096bcb89..41d2cb8a2c7564 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -35,6 +36,23 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ptest_v16i1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: umaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 %v2 = fcmp une <16 x float> %v1, zeroinitializer @@ -92,6 +110,33 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ptest_or_v16i1: +; NONEON-NOSVE: // %bb.0: 
+; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x1, #32] +; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NONEON-NOSVE-NEXT: ldp q6, q7, [x1] +; NONEON-NOSVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: fcmeq v7.4s, v7.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: orn v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: umaxv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 %v2 = fcmp une <16 x float> %v1, zeroinitializer @@ -159,6 +204,33 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: ptest_and_v16i1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x1, #32] +; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NONEON-NOSVE-NEXT: ldp q6, q7, [x1] +; NONEON-NOSVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: fcmeq v7.4s, v7.4s, #0.0 +; NONEON-NOSVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NONEON-NOSVE-NEXT: 
uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b +; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: uminv b0, v0.16b +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 %v2 = fcmp une <16 x float> %v1, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll index 726fd28c90ae22..5626f77c684f22 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -18,6 +19,13 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) { ; CHECK-NEXT: lsr z0.h, z0.h, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %op) ret <4 x i8> %res } @@ -30,6 +38,11 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) { ; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v8i8: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op) ret <8 x i8> %res } @@ -42,6 +55,11 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) { ; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op) ret <16 x i8> %res } @@ -55,6 +73,14 @@ define void @bitreverse_v32i8(ptr %a) { ; CHECK-NEXT: rbit z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op) store <32 x i8> %res, ptr %a @@ -70,6 +96,13 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) { ; CHECK-NEXT: lsr z0.s, z0.s, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -82,6 +115,12 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) { ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ -94,6 +133,12 @@ 
define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) { ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -107,6 +152,16 @@ define void @bitreverse_v16i16(ptr %a) { ; CHECK-NEXT: rbit z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -121,6 +176,12 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) { ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -133,6 +194,12 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) { ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -146,6 +213,16 @@ define void @bitreverse_v8i32(ptr %a) { ; CHECK-NEXT: rbit 
z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -160,6 +237,12 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) { ; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -172,6 +255,12 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) { ; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -185,6 +274,16 @@ define void @bitreverse_v4i64(ptr %a) { ; CHECK-NEXT: rbit z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bitreverse_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op) store <4 
x i64> %res, ptr %a @@ -204,6 +303,12 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) { ; CHECK-NEXT: lsr z0.s, z0.s, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op) ret <2 x i16> %res } @@ -216,6 +321,11 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) { ; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op) ret <4 x i16> %res } @@ -228,6 +338,11 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) { ; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op) ret <8 x i16> %res } @@ -241,6 +356,14 @@ define void @bswap_v16i16(ptr %a) { ; CHECK-NEXT: revb z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op) store <16 x i16> %res, ptr %a @@ -255,6 +378,11 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) { ; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op) ret <2 x i32> %res } @@ -267,6 +395,11 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) { ; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op) ret <4 x i32> %res } @@ -280,6 +413,14 @@ define void @bswap_v8i32(ptr %a) { ; CHECK-NEXT: revb z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op) store <8 x i32> %res, ptr %a @@ -294,6 +435,11 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) { ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op) ret <1 x i64> %res } @@ -306,6 +452,11 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) { ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op) ret <2 x i64> %res } @@ -319,6 +470,14 @@ define void @bswap_v4i64(ptr %a) { ; CHECK-NEXT: revb z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: bswap_v4i64: +; NONEON-NOSVE: // %bb.0: +; 
NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op) store <4 x i64> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll index c022bf85e67e93..55f4f5bae641e5 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -14,6 +15,19 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) { ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v1.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff +; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 +; NONEON-NOSVE-NEXT: ushr v1.4h, v1.4h, #7 +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: usra v0.4h, v1.4h, #3 +; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i8> %op1, shufflevector (<4 x i8> insertelement (<4 x i8> poison, i8 32, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer) ret <4 x i8> %res } @@ -26,6 +40,13 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) { ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 
killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.8b, v0.8b, #0 +; NONEON-NOSVE-NEXT: usra v0.8b, v1.8b, #3 +; NONEON-NOSVE-NEXT: sshr v0.8b, v0.8b, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer) ret <8 x i8> %res } @@ -38,6 +59,13 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) { ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: usra v0.16b, v1.16b, #3 +; NONEON-NOSVE-NEXT: sshr v0.16b, v0.16b, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer) ret <16 x i8> %res } @@ -51,6 +79,18 @@ define void @sdiv_v32i8(ptr %a) { ; CHECK-NEXT: asrd z1.b, p0/m, z1.b, #5 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v2.16b, v0.16b, #0 +; NONEON-NOSVE-NEXT: cmlt v3.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: usra v0.16b, v2.16b, #3 +; NONEON-NOSVE-NEXT: usra v1.16b, v3.16b, #3 +; NONEON-NOSVE-NEXT: sshr v0.16b, v0.16b, #5 +; NONEON-NOSVE-NEXT: sshr v1.16b, v1.16b, #5 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer) store <32 x i8> %res, ptr %a @@ -66,6 +106,20 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) { ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i16: +; 
NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: shl v1.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f +; NONEON-NOSVE-NEXT: dup v2.2s, w8 +; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 +; NONEON-NOSVE-NEXT: ushr v1.2s, v1.2s, #26 +; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i16> %op1, shufflevector (<2 x i16> insertelement (<2 x i16> poison, i16 32, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer) ret <2 x i16> %res } @@ -78,6 +132,13 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) { ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.4h, v0.4h, #0 +; NONEON-NOSVE-NEXT: usra v0.4h, v1.4h, #11 +; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer) ret <4 x i16> %res } @@ -90,6 +151,13 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) { ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.8h, v0.8h, #0 +; NONEON-NOSVE-NEXT: usra v0.8h, v1.8h, #11 +; NONEON-NOSVE-NEXT: sshr v0.8h, v0.8h, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer) ret <8 x i16> %res } @@ -103,6 +171,18 @@ define void @sdiv_v16i16(ptr %a) { ; CHECK-NEXT: asrd z1.h, p0/m, z1.h, #5 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v16i16: +; NONEON-NOSVE: // 
%bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v2.8h, v0.8h, #0 +; NONEON-NOSVE-NEXT: cmlt v3.8h, v1.8h, #0 +; NONEON-NOSVE-NEXT: usra v0.8h, v2.8h, #11 +; NONEON-NOSVE-NEXT: usra v1.8h, v3.8h, #11 +; NONEON-NOSVE-NEXT: sshr v0.8h, v0.8h, #5 +; NONEON-NOSVE-NEXT: sshr v1.8h, v1.8h, #5 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer) store <16 x i16> %res, ptr %a @@ -117,6 +197,13 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) { ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.2s, v0.2s, #0 +; NONEON-NOSVE-NEXT: usra v0.2s, v1.2s, #27 +; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer) ret <2 x i32> %res } @@ -129,6 +216,13 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) { ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.4s, v0.4s, #0 +; NONEON-NOSVE-NEXT: usra v0.4s, v1.4s, #27 +; NONEON-NOSVE-NEXT: sshr v0.4s, v0.4s, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer) ret <4 x i32> %res } @@ -142,6 +236,18 @@ define void @sdiv_v8i32(ptr %a) { ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #5 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; 
NONEON-NOSVE-NEXT: cmlt v2.4s, v0.4s, #0 +; NONEON-NOSVE-NEXT: cmlt v3.4s, v1.4s, #0 +; NONEON-NOSVE-NEXT: usra v0.4s, v2.4s, #27 +; NONEON-NOSVE-NEXT: usra v1.4s, v3.4s, #27 +; NONEON-NOSVE-NEXT: sshr v0.4s, v0.4s, #5 +; NONEON-NOSVE-NEXT: sshr v1.4s, v1.4s, #5 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer) store <8 x i32> %res, ptr %a @@ -156,6 +262,13 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) { ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt d1, d0, #0 +; NONEON-NOSVE-NEXT: usra d0, d1, #59 +; NONEON-NOSVE-NEXT: sshr d0, d0, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer) ret <1 x i64> %res } @@ -169,6 +282,13 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) { ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: cmlt v1.2d, v0.2d, #0 +; NONEON-NOSVE-NEXT: usra v0.2d, v1.2d, #59 +; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #5 +; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer) ret <2 x i64> %res } @@ -182,6 +302,18 @@ define void @sdiv_v4i64(ptr %a) { ; CHECK-NEXT: asrd z1.d, p0/m, z1.d, #5 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sdiv_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: cmlt v2.2d, v0.2d, #0 +; NONEON-NOSVE-NEXT: cmlt v3.2d, v1.2d, #0 +; 
NONEON-NOSVE-NEXT: usra v0.2d, v2.2d, #59 +; NONEON-NOSVE-NEXT: usra v1.2d, v3.2d, #59 +; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #5 +; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #5 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer) store <4 x i64> %res, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll index 649b13fa8a1e35..e15529e1926ac7 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -15,6 +16,11 @@ define <4 x i8> @splat_v4i8(i8 %a) { ; CHECK-NEXT: mov z0.h, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.4h, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i8> undef, i8 %a, i64 0 %splat = shufflevector <4 x i8> %insert, <4 x i8> undef, <4 x i32> zeroinitializer ret <4 x i8> %splat @@ -26,6 +32,11 @@ define <8 x i8> @splat_v8i8(i8 %a) { ; CHECK-NEXT: mov z0.b, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.8b, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i8> undef, i8 %a, i64 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer ret <8 x i8> %splat @@ -37,6 +48,11 @@ define <16 x i8> @splat_v16i8(i8 %a) { ; 
CHECK-NEXT: mov z0.b, w0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.16b, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i8> undef, i8 %a, i64 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %splat @@ -48,6 +64,12 @@ define void @splat_v32i8(i8 %a, ptr %b) { ; CHECK-NEXT: mov z0.b, w0 ; CHECK-NEXT: stp q0, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.16b, w0 +; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer store <32 x i8> %splat, ptr %b @@ -60,6 +82,11 @@ define <2 x i16> @splat_v2i16(i16 %a) { ; CHECK-NEXT: mov z0.s, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i16> undef, i16 %a, i64 0 %splat = shufflevector <2 x i16> %insert, <2 x i16> undef, <2 x i32> zeroinitializer ret <2 x i16> %splat @@ -71,6 +98,11 @@ define <4 x i16> @splat_v4i16(i16 %a) { ; CHECK-NEXT: mov z0.h, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.4h, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i16> undef, i16 %a, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer ret <4 x i16> %splat @@ -82,6 +114,11 @@ define <8 x i16> @splat_v8i16(i16 %a) { ; CHECK-NEXT: mov z0.h, w0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: 
dup v0.8h, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i16> undef, i16 %a, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %splat @@ -93,6 +130,12 @@ define void @splat_v16i16(i16 %a, ptr %b) { ; CHECK-NEXT: mov z0.h, w0 ; CHECK-NEXT: stp q0, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.8h, w0 +; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer store <16 x i16> %splat, ptr %b @@ -105,6 +148,11 @@ define <2 x i32> @splat_v2i32(i32 %a) { ; CHECK-NEXT: mov z0.s, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i32> undef, i32 %a, i64 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer ret <2 x i32> %splat @@ -116,6 +164,11 @@ define <4 x i32> @splat_v4i32(i32 %a) { ; CHECK-NEXT: mov z0.s, w0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.4s, w0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i32> undef, i32 %a, i64 0 %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat @@ -127,6 +180,12 @@ define void @splat_v8i32(i32 %a, ptr %b) { ; CHECK-NEXT: mov z0.s, w0 ; CHECK-NEXT: stp q0, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.4s, w0 +; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> 
undef, <8 x i32> zeroinitializer store <8 x i32> %splat, ptr %b @@ -139,6 +198,11 @@ define <1 x i64> @splat_v1i64(i64 %a) { ; CHECK-NEXT: mov z0.d, x0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov d0, x0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x i64> undef, i64 %a, i64 0 %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer ret <1 x i64> %splat @@ -150,6 +214,11 @@ define <2 x i64> @splat_v2i64(i64 %a) { ; CHECK-NEXT: mov z0.d, x0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.2d, x0 +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i64> undef, i64 %a, i64 0 %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat @@ -161,6 +230,12 @@ define void @splat_v4i64(i64 %a, ptr %b) { ; CHECK-NEXT: mov z0.d, x0 ; CHECK-NEXT: stp q0, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: dup v0.2d, x0 +; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer store <4 x i64> %splat, ptr %b @@ -178,6 +253,12 @@ define <2 x half> @splat_v2f16(half %a) { ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.4h, v0.h[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x half> undef, half %a, i64 0 %splat = shufflevector <2 x half> %insert, <2 x half> undef, <2 x i32> zeroinitializer ret <2 x half> %splat @@ -190,6 +271,12 @@ define <4 x half> 
@splat_v4f16(half %a) { ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.4h, v0.h[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x half> undef, half %a, i64 0 %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer ret <4 x half> %splat @@ -202,6 +289,12 @@ define <8 x half> @splat_v8f16(half %a) { ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.8h, v0.h[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x half> undef, half %a, i64 0 %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer ret <8 x half> %splat @@ -214,6 +307,13 @@ define void @splat_v16f16(half %a, ptr %b) { ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.8h, v0.h[0] +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x half> undef, half %a, i64 0 %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer store <16 x half> %splat, ptr %b @@ -227,6 +327,12 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) { ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x float> undef, float %a, i64 0 %splat = 
shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer ret <2 x float> %splat @@ -239,6 +345,12 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) { ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x float> undef, float %a, i64 0 %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %splat @@ -251,6 +363,13 @@ define void @splat_v8f32(float %a, ptr %b) { ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x float> undef, float %a, i64 0 %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer store <8 x float> %splat, ptr %b @@ -261,6 +380,10 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) { ; CHECK-LABEL: splat_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x double> undef, double %a, i64 0 %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer ret <1 x double> %splat @@ -273,6 +396,12 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) { ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2d, v0.d[0] +; NONEON-NOSVE-NEXT: ret 
%insert = insertelement <2 x double> undef, double %a, i64 0 %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer ret <2 x double> %splat @@ -285,6 +414,13 @@ define void @splat_v4f64(double %a, ptr %b) { ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: dup v0.2d, v0.d[0] +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x double> undef, double %a, i64 0 %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer store <4 x double> %splat, ptr %b @@ -301,6 +437,12 @@ define void @splat_imm_v32i8(ptr %a) { ; CHECK-NEXT: mov z0.b, #1 // =0x1 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.16b, #1 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <32 x i8> undef, i8 1, i64 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer store <32 x i8> %splat, ptr %a @@ -313,6 +455,13 @@ define void @splat_imm_v16i16(ptr %a) { ; CHECK-NEXT: mov z0.h, #2 // =0x2 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #2 // =0x2 +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i16> undef, i16 2, i64 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer store <16 x i16> %splat, ptr %a @@ -325,6 +474,13 @@ define void @splat_imm_v8i32(ptr %a) { ; CHECK-NEXT: mov z0.s, #3 // =0x3 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, 
#3 // =0x3 +; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i32> undef, i32 3, i64 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer store <8 x i32> %splat, ptr %a @@ -337,6 +493,13 @@ define void @splat_imm_v4i64(ptr %a) { ; CHECK-NEXT: mov z0.d, #4 // =0x4 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #4 // =0x4 +; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i64> undef, i64 4, i64 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer store <4 x i64> %splat, ptr %a @@ -353,6 +516,13 @@ define void @splat_imm_v16f16(ptr %a) { ; CHECK-NEXT: fmov z0.h, #5.00000000 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov w8, #17664 // =0x4500 +; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x half> undef, half 5.0, i64 0 %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer store <16 x half> %splat, ptr %a @@ -365,6 +535,12 @@ define void @splat_imm_v8f32(ptr %a) { ; CHECK-NEXT: fmov z0.s, #6.00000000 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov v0.4s, #6.00000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x float> undef, float 6.0, i64 0 %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer store <8 x float> %splat, ptr %a @@ -377,6 +553,12 @@ define void @splat_imm_v4f64(ptr %a) { ; CHECK-NEXT: fmov z0.d, #7.00000000 ; CHECK-NEXT: stp q0, q0, [x0] ; 
CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: splat_imm_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov v0.2d, #7.00000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x double> undef, double 7.0, i64 0 %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer store <4 x double> %splat, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll index c7435bdbec9497..f055061b13bed6 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -12,6 +13,11 @@ define void @store_v4i8(ptr %a) { ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i8> zeroinitializer, ptr %a ret void } @@ -22,6 +28,12 @@ define void @store_v8i8(ptr %a) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x i8> zeroinitializer, ptr %a ret void } @@ -32,6 +44,12 @@ define void @store_v16i8(ptr %a) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi 
v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x i8> zeroinitializer, ptr %a ret void } @@ -42,6 +60,12 @@ define void @store_v32i8(ptr %a) { ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <32 x i8> zeroinitializer, ptr %a ret void } @@ -53,6 +77,11 @@ define void @store_v2i16(ptr %a) { ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x i16> zeroinitializer, ptr %a ret void } @@ -64,6 +93,11 @@ define void @store_v2f16(ptr %a) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x half> zeroinitializer, ptr %a ret void } @@ -74,6 +108,12 @@ define void @store_v4i16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i16> zeroinitializer, ptr %a ret void } @@ -84,6 +124,12 @@ define void @store_v4f16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x half> zeroinitializer, ptr %a ret void } @@ -94,6 +140,12 @@ define void @store_v8i16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; 
+; NONEON-NOSVE-LABEL: store_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x i16> zeroinitializer, ptr %a ret void } @@ -104,6 +156,12 @@ define void @store_v8f16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x half> zeroinitializer, ptr %a ret void } @@ -114,6 +172,12 @@ define void @store_v16i16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x i16> zeroinitializer, ptr %a ret void } @@ -124,6 +188,12 @@ define void @store_v16f16(ptr %a) { ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <16 x half> zeroinitializer, ptr %a ret void } @@ -133,6 +203,11 @@ define void @store_v2i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: str xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x i32> zeroinitializer, ptr %a ret void } @@ -142,6 +217,11 @@ define void @store_v2f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: str xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x float> zeroinitializer, ptr %a ret void } @@ -151,6 +231,11 @@ define void 
@store_v4i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: stp xzr, xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp xzr, xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i32> zeroinitializer, ptr %a ret void } @@ -160,6 +245,11 @@ define void @store_v4f32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: stp xzr, xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp xzr, xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x float> zeroinitializer, ptr %a ret void } @@ -170,6 +260,12 @@ define void @store_v8i32(ptr %a) { ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x i32> zeroinitializer, ptr %a ret void } @@ -180,6 +276,12 @@ define void @store_v8f32(ptr %a) { ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <8 x float> zeroinitializer, ptr %a ret void } @@ -190,6 +292,12 @@ define void @store_v1i64(ptr %a) { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v1i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: ret store <1 x i64> zeroinitializer, ptr %a ret void } @@ -200,6 +308,12 @@ define void @store_v1f64(ptr %a) { ; CHECK-NEXT: fmov d0, xzr ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v1f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: str d0, [x0] +; 
NONEON-NOSVE-NEXT: ret store <1 x double> zeroinitializer, ptr %a ret void } @@ -209,6 +323,11 @@ define void @store_v2i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: stp xzr, xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp xzr, xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x i64> zeroinitializer, ptr %a ret void } @@ -218,6 +337,11 @@ define void @store_v2f64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: stp xzr, xzr, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp xzr, xzr, [x0] +; NONEON-NOSVE-NEXT: ret store <2 x double> zeroinitializer, ptr %a ret void } @@ -228,6 +352,12 @@ define void @store_v4i64(ptr %a) { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x i64> zeroinitializer, ptr %a ret void } @@ -238,6 +368,12 @@ define void @store_v4f64(ptr %a) { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: stp q0, q0, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: ret store <4 x double> zeroinitializer, ptr %a ret void } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll index 9e04fc236836cc..80c9ef87e9b915 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme 
-force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE ; Test we can code generater patterns of the form: @@ -23,6 +24,12 @@ define void @subvector_v4i8(ptr %in, ptr %out) { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] ; CHECK-NEXT: st1b { z0.h }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4i8: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i8>, ptr %in br label %bb1 @@ -37,6 +44,12 @@ define void @subvector_v8i8(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8i8: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i8>, ptr %in br label %bb1 @@ -51,6 +64,12 @@ define void @subvector_v16i8(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v16i8: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i8>, ptr %in br label %bb1 @@ -65,6 +84,12 @@ define void @subvector_v32i8(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v32i8: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in br label %bb1 @@ -81,6 +106,12 @@ define void @subvector_v2i16(ptr %in, ptr %out) { ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2i16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ret %a = 
load <2 x i16>, ptr %in br label %bb1 @@ -95,6 +126,12 @@ define void @subvector_v4i16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4i16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i16>, ptr %in br label %bb1 @@ -109,6 +146,12 @@ define void @subvector_v8i16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8i16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %in br label %bb1 @@ -123,6 +166,12 @@ define void @subvector_v16i16(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v16i16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in br label %bb1 @@ -138,6 +187,12 @@ define void @subvector_v2i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2i32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x i32>, ptr %in br label %bb1 @@ -152,6 +207,12 @@ define void @subvector_v4i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4i32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %in br label %bb1 @@ -166,6 +227,12 @@ define void @subvector_v8i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; 
CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8i32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in br label %bb1 @@ -181,6 +248,12 @@ define void @subvector_v2i64(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2i64: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %in br label %bb1 @@ -195,6 +268,12 @@ define void @subvector_v4i64(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4i64: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in br label %bb1 @@ -210,6 +289,12 @@ define void @subvector_v2f16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2f16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x half>, ptr %in br label %bb1 @@ -224,6 +309,12 @@ define void @subvector_v4f16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4f16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %in br label %bb1 @@ -238,6 +329,12 @@ define void @subvector_v8f16(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8f16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; 
NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %in br label %bb1 @@ -252,6 +349,12 @@ define void @subvector_v16f16(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v16f16: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x half>, ptr %in br label %bb1 @@ -267,6 +370,12 @@ define void @subvector_v2f32(ptr %in, ptr %out) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2f32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %in br label %bb1 @@ -281,6 +390,12 @@ define void @subvector_v4f32(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4f32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %in br label %bb1 @@ -295,6 +410,12 @@ define void @subvector_v8f32(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v8f32: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x float>,ptr %in br label %bb1 @@ -310,6 +431,12 @@ define void @subvector_v2f64(ptr %in, ptr %out) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v2f64: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %in br label %bb1 @@ -324,6 +451,12 @@ define void @subvector_v4f64(ptr %in, ptr %out) { ; CHECK-NEXT: ldp 
q0, q1, [x0] ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: subvector_v4f64: +; NONEON-NOSVE: // %bb.0: // %bb1 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %in br label %bb1 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll index b34fe438a063a9..41b68e10e75ded 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -12,6 +13,13 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: st1b { z0.h }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v8i16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %ap %val = trunc <8 x i16> %a to <8 x i8> store <8 x i8> %val, ptr %dest @@ -25,6 +33,14 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: st1b { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v4i32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = trunc <4 x i32> %a to <4 x i8> store <4 x i8> %val, ptr %dest 
@@ -38,6 +54,13 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: st1h { z0.s }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v4i32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = trunc <4 x i32> %a to <4 x i16> store <4 x i16> %val, ptr %dest @@ -51,6 +74,13 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: st1w { z0.d }, p0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v2i64i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = trunc <2 x i64> %a to <2 x i32> store <2 x i32> %val, ptr %dest @@ -66,6 +96,14 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) { ; CHECK-NEXT: splice z1.d, p0, z1.d, z0.d ; CHECK-NEXT: str q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: store_trunc_v2i256i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr d0, [x0, #32] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: mov v1.d[1], v0.d[0] +; NONEON-NOSVE-NEXT: str q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <2 x i256>, ptr %ap %val = trunc <2 x i256> %a to <2 x i64> store <2 x i64> %val, ptr %dest diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll index 9e56462df38890..8242b4e26d5057 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme 
-force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -19,6 +20,12 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i16_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = trunc <16 x i16> %a to <16 x i8> ret <16 x i8> %b @@ -41,6 +48,17 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.b, z2.b, z2.b ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i16>, ptr %in %b = trunc <32 x i16> %a to <32 x i8> %c = add <32 x i8> %b, %b @@ -76,6 +94,24 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v64i16_v64i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: uzp1 v3.16b, v5.16b, v4.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v6.16b, v1.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; 
NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b +; NONEON-NOSVE-NEXT: add v3.16b, v3.16b, v3.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <64 x i16>, ptr %in %b = trunc <64 x i16> %a to <64 x i8> %c = add <64 x i8> %b, %b @@ -133,6 +169,38 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q2, q3, [x1, #32] ; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] +; NONEON-NOSVE-NEXT: uzp1 v4.16b, v5.16b, v4.16b +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.16b, v7.16b, v6.16b +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v16.16b, v1.16b +; NONEON-NOSVE-NEXT: uzp1 v5.16b, v17.16b, v5.16b +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.16b, v3.16b, v2.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v4.16b, v4.16b, v4.16b +; NONEON-NOSVE-NEXT: uzp1 v7.16b, v18.16b, v7.16b +; NONEON-NOSVE-NEXT: add v3.16b, v6.16b, v6.16b +; NONEON-NOSVE-NEXT: uzp1 v6.16b, v17.16b, v16.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] +; NONEON-NOSVE-NEXT: add v0.16b, v5.16b, v5.16b +; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b +; NONEON-NOSVE-NEXT: add v4.16b, v7.16b, v7.16b +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] +; NONEON-NOSVE-NEXT: add v1.16b, v6.16b, v6.16b +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <128 x i16>, ptr %in %b = trunc <128 x i16> %a 
to <128 x i8> %c = add <128 x i8> %b, %b @@ -155,6 +223,13 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind { ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i32_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = trunc <8 x i32> %a to <8 x i8> ret <8 x i8> %b @@ -178,6 +253,15 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i32_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ret %a = load <16 x i32>, ptr %in %b = trunc <16 x i32> %a to <16 x i8> ret <16 x i8> %b @@ -215,6 +299,23 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.b, z3.b, z3.b ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i32_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v7.8h, v6.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load 
<32 x i32>, ptr %in %b = trunc <32 x i32> %a to <32 x i8> %c = add <32 x i8> %b, %b @@ -279,6 +380,36 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #128] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #160] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #224] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] +; NONEON-NOSVE-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.8h, v7.8h, v6.8h +; NONEON-NOSVE-NEXT: ldp q16, q7, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp1 v5.8h, v17.8h, v5.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: uzp1 v7.8h, v16.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v19.8h, v18.8h +; NONEON-NOSVE-NEXT: uzp1 v2.16b, v4.16b, v6.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v1.16b, v7.16b +; NONEON-NOSVE-NEXT: uzp1 v3.16b, v5.16b, v3.16b +; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add v3.16b, v3.16b, v3.16b +; NONEON-NOSVE-NEXT: stp q1, q3, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i8> %c = add <64 x i8> %b, %b @@ -300,6 +431,12 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i32_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, 
v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = trunc <8 x i32> %a to <8 x i16> ret <8 x i16> %b @@ -322,6 +459,17 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.h, z2.h, z2.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i32>, ptr %in %b = trunc <16 x i32> %a to <16 x i16> %c = add <16 x i16> %b, %b @@ -357,6 +505,24 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i32_v32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v6.8h, v1.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h +; NONEON-NOSVE-NEXT: add v3.8h, v3.8h, v3.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i32>, ptr %in %b = trunc <32 x i32> %a to <32 x i16> %c = add <32 x i16> %b, %b @@ -414,6 +580,38 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q2, q3, [x1, #32] ; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
trunc_v64i32_v64i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] +; NONEON-NOSVE-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.8h, v7.8h, v6.8h +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v16.8h, v1.8h +; NONEON-NOSVE-NEXT: uzp1 v5.8h, v17.8h, v5.8h +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v4.8h, v4.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v7.8h, v18.8h, v7.8h +; NONEON-NOSVE-NEXT: add v3.8h, v6.8h, v6.8h +; NONEON-NOSVE-NEXT: uzp1 v6.8h, v17.8h, v16.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] +; NONEON-NOSVE-NEXT: add v0.8h, v5.8h, v5.8h +; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h +; NONEON-NOSVE-NEXT: add v4.8h, v7.8h, v7.8h +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] +; NONEON-NOSVE-NEXT: add v1.8h, v6.8h, v6.8h +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i16> %c = add <64 x i16> %b, %b @@ -437,6 +635,13 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v4i64_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i8> ret <4 x i8> %b @@ -461,6 +666,16 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) 
nounwind { ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i64_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i8> ret <8 x i8> %b @@ -499,6 +714,21 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind { ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i64_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v3.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i8> ret <16 x i8> %b @@ -565,6 +795,35 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #128] +; 
NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #160] +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v16.4s, v17.4s, v16.4s +; NONEON-NOSVE-NEXT: uzp1 v5.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v7.4s, v19.4s, v18.4s +; NONEON-NOSVE-NEXT: uzp1 v6.4s, v21.4s, v20.4s +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v4.8h, v16.8h +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v2.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v5.8h +; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: uzp1 v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b +; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i8> %c = add <32 x i8> %b, %b @@ -587,6 +846,13 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind { ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v4i64_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i16> ret <4 x i16> %b @@ -610,6 +876,15 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind { ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i64_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr 
%in %b = trunc <8 x i64> %a to <8 x i16> ret <8 x i16> %b @@ -647,6 +922,23 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.h, z3.h, z3.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i64_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i16> %c = add <16 x i16> %b, %b @@ -711,6 +1003,36 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #128] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #160] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #224] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: ldp q16, q7, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v5.4s, v17.4s, v5.4s +; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: uzp1 v7.4s, v16.4s, v7.4s +; 
NONEON-NOSVE-NEXT: uzp1 v3.4s, v19.4s, v18.4s +; NONEON-NOSVE-NEXT: uzp1 v2.8h, v4.8h, v6.8h +; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h +; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v7.8h +; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v3.8h +; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h +; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: add v3.8h, v3.8h, v3.8h +; NONEON-NOSVE-NEXT: stp q1, q3, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i16> %c = add <32 x i16> %b, %b @@ -732,6 +1054,12 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind { ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v4i64_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i32> ret <4 x i32> %b @@ -754,6 +1082,17 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: add z1.s, z2.s, z2.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i32> %c = add <8 x i32> %b, %b @@ -789,6 +1128,24 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v16i64_v16i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; 
NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v6.4s, v1.4s +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v2.4s, v2.4s, v2.4s +; NONEON-NOSVE-NEXT: add v3.4s, v3.4s, v3.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] +; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i32> %c = add <16 x i32> %b, %b @@ -846,6 +1203,38 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: stp q2, q3, [x1, #32] ; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] +; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] +; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] +; NONEON-NOSVE-NEXT: uzp1 v6.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] +; NONEON-NOSVE-NEXT: uzp1 v1.4s, v16.4s, v1.4s +; NONEON-NOSVE-NEXT: uzp1 v5.4s, v17.4s, v5.4s +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] +; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s +; NONEON-NOSVE-NEXT: add v4.4s, v4.4s, v4.4s +; NONEON-NOSVE-NEXT: uzp1 v7.4s, v18.4s, v7.4s +; NONEON-NOSVE-NEXT: add v3.4s, v6.4s, v6.4s +; NONEON-NOSVE-NEXT: uzp1 v6.4s, v17.4s, v16.4s +; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] +; NONEON-NOSVE-NEXT: add v0.4s, v5.4s, v5.4s +; NONEON-NOSVE-NEXT: add v2.4s, v2.4s, 
v2.4s +; NONEON-NOSVE-NEXT: add v4.4s, v7.4s, v7.4s +; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] +; NONEON-NOSVE-NEXT: add v1.4s, v6.4s, v6.4s +; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i32> %c = add <32 x i32> %b, %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll index 304823c9e64145..874af15e211177 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -14,6 +15,12 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v1.8b, v0.8b, v0.8b, #6 +; NONEON-NOSVE-NEXT: trn1 v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> ret <4 x i8> %ret } @@ -28,6 +35,11 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: insr z1.b, w8 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #7 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> ret <8 x i8> %ret } @@ -42,6 +54,11 @@ define <16 
x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: insr z1.b, w8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> ret <16 x i8> %ret @@ -60,6 +77,15 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.b, w8 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #15 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %ret = shufflevector <32 x i8> %op1, <32 x i8> %op2, <32 x i32> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: rev64 v0.2s, v0.2s +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32> ret <2 x i16> %ret } @@ -92,6 +123,11 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: insr z1.h, w8 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> ret <4 x i16> %ret } @@ -106,6 +142,11 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: insr z1.h, w8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: 
shuffle_ext_byone_v8i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> ret <8 x i16> %ret } @@ -123,6 +164,15 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.h, w8 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b %ret = shufflevector <16 x i16> %op1, <16 x i16> %op2, <16 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: insr z1.s, w8 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32> ret <2 x i32> %ret } @@ -155,6 +210,11 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: insr z1.s, w8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32> ret <4 x i32> %ret } @@ -172,6 +232,15 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.s, w8 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: 
ext v1.16b, v1.16b, v2.16b, #12 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b %ret = shufflevector <8 x i32> %op1, <8 x i32> %op2, <8 x i32> @@ -189,6 +258,11 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: insr z1.d, x8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32> ret <2 x i64> %ret } @@ -206,6 +280,15 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.d, x8 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %ret = shufflevector <4 x i64> %op1, <4 x i64> %op2, <4 x i32> @@ -223,6 +306,11 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-NEXT: insr z0.h, h2 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> ret <4 x half> %ret } @@ -236,6 +324,11 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) { ; CHECK-NEXT: insr z0.h, h2 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, 
v1.16b, #14 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> ret <8 x half> %ret } @@ -251,6 +344,15 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.h, h2 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %ret = shufflevector <16 x half> %op1, <16 x half> %op2, <16 x i32> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) ; CHECK-NEXT: insr z0.s, s2 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> ret <2 x float> %ret } @@ -281,6 +388,11 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) ; CHECK-NEXT: insr z0.s, s2 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> ret <4 x float> %ret } @@ -296,6 +408,15 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.s, s2 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, 
v2.16b, #12 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b %ret = shufflevector <8 x float> %op1, <8 x float> %op2, <8 x i32> @@ -312,6 +433,11 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op ; CHECK-NEXT: insr z0.d, d2 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> ret <2 x double> %ret } @@ -327,6 +453,15 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.d, d2 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> @@ -345,6 +480,15 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) { ; CHECK-NEXT: insr z3.d, d2 ; CHECK-NEXT: stp q1, q3, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_byone_reverse: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] +; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> @@ -359,6 +503,13 @@ define void @shuffle_ext_invalid(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q1, [x1] ; 
CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: shuffle_ext_invalid: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll index 6c9c0556056684..e69f59aedc026f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -11,6 +12,11 @@ define fp128 @test_streaming_compatible_register_mov(fp128 %q0, fp128 %q1) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: test_streaming_compatible_register_mov: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ret ret fp128 %q1 } @@ -20,6 +26,11 @@ define double @fp_zero_constant() { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, xzr ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fp_zero_constant: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: fmov d0, xzr +; NONEON-NOSVE-NEXT: ret ret double 0.0 } @@ -29,6 +40,11 @@ define <2 x i64> @fixed_vec_zero_constant() { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fixed_vec_zero_constant: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, 
#0000000000000000 +; NONEON-NOSVE-NEXT: ret ret <2 x i64> zeroinitializer } @@ -38,5 +54,10 @@ define <2 x double> @fixed_vec_fp_zero_constant() { ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: ret ret <2 x double> } From 3864bfd2e0ce7e32fd623c550660885599383e6a Mon Sep 17 00:00:00 2001 From: Daniil Fukalov <1671137+dfukalov@users.noreply.github.com> Date: Tue, 28 May 2024 16:09:53 +0200 Subject: [PATCH 16/89] [IR] Fix ignoring `non-global-value-max-name-size` in `ValueSymbolTable::makeUniqueName()`. (#89057) E.g. during inlining new symbol name can be duplicated and then `ValueSymbolTable::makeUniqueName()` will add unique suffix, exceeding the `non-global-value-max-name-size` restriction. Also fixed `unsigned` type of the option to `int` since `ValueSymbolTable`' constructor can use `-1` value that means unrestricted name size. --- llvm/lib/IR/Function.cpp | 2 +- llvm/lib/IR/ValueSymbolTable.cpp | 33 ++++++++++++------- .../non-global-value-max-name-size-2.ll | 23 +++++++++++++ llvm/test/Bitcode/value-with-long-name-dbg.ll | 11 +++++++ llvm/test/Bitcode/value-with-long-name.ll | 4 +-- 5 files changed, 58 insertions(+), 15 deletions(-) create mode 100644 llvm/test/Assembler/non-global-value-max-name-size-2.ll create mode 100644 llvm/test/Bitcode/value-with-long-name-dbg.ll diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index bd06ff82a15a58..13fa1afeaaff24 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -79,7 +79,7 @@ using ProfileCount = Function::ProfileCount; // are not in the public header file... 
template class llvm::SymbolTableListTraits; -static cl::opt NonGlobalValueMaxNameSize( +static cl::opt NonGlobalValueMaxNameSize( "non-global-value-max-name-size", cl::Hidden, cl::init(1024), cl::desc("Maximum size for the name of non-global values.")); diff --git a/llvm/lib/IR/ValueSymbolTable.cpp b/llvm/lib/IR/ValueSymbolTable.cpp index 52f7ddcdc65a2b..a020acf22a96c5 100644 --- a/llvm/lib/IR/ValueSymbolTable.cpp +++ b/llvm/lib/IR/ValueSymbolTable.cpp @@ -43,23 +43,34 @@ ValueSymbolTable::~ValueSymbolTable() { ValueName *ValueSymbolTable::makeUniqueName(Value *V, SmallString<256> &UniqueName) { unsigned BaseSize = UniqueName.size(); + bool AppenDot = false; + if (auto *GV = dyn_cast(V)) { + // A dot is appended to mark it as clone during ABI demangling so that + // for example "_Z1fv" and "_Z1fv.1" both demangle to "f()", the second + // one being a clone. + // On NVPTX we cannot use a dot because PTX only allows [A-Za-z0-9_$] for + // identifiers. This breaks ABI demangling but at least ptxas accepts and + // compiles the program. + const Module *M = GV->getParent(); + if (!(M && Triple(M->getTargetTriple()).isNVPTX())) + AppenDot = true; + } + while (true) { // Trim any suffix off and append the next number. UniqueName.resize(BaseSize); raw_svector_ostream S(UniqueName); - if (auto *GV = dyn_cast(V)) { - // A dot is appended to mark it as clone during ABI demangling so that - // for example "_Z1fv" and "_Z1fv.1" both demangle to "f()", the second - // one being a clone. - // On NVPTX we cannot use a dot because PTX only allows [A-Za-z0-9_$] for - // identifiers. This breaks ABI demangling but at least ptxas accepts and - // compiles the program. - const Module *M = GV->getParent(); - if (!(M && Triple(M->getTargetTriple()).isNVPTX())) - S << "."; - } + if (AppenDot) + S << "."; S << ++LastUnique; + // Retry if MaxNameSize has been exceeded. 
+ if (MaxNameSize > -1 && UniqueName.size() > (size_t)MaxNameSize) { + assert(BaseSize >= UniqueName.size() - (size_t)MaxNameSize && + "Can't generate unique name: MaxNameSize is too small."); + BaseSize -= UniqueName.size() - (size_t)MaxNameSize; + continue; + } // Try insert the vmap entry with this suffix. auto IterBool = vmap.insert(std::make_pair(UniqueName.str(), V)); if (IterBool.second) diff --git a/llvm/test/Assembler/non-global-value-max-name-size-2.ll b/llvm/test/Assembler/non-global-value-max-name-size-2.ll new file mode 100644 index 00000000000000..5eac003ddb4383 --- /dev/null +++ b/llvm/test/Assembler/non-global-value-max-name-size-2.ll @@ -0,0 +1,23 @@ +; RUN: opt < %s -S -passes='always-inline' -non-global-value-max-name-size=5 | opt -non-global-value-max-name-size=5 -passes=verify -disable-output + +; Opt should not generate too long name for labels during inlining. + +define internal i32 @inner(i32 %flag) alwaysinline { +entry: + %icmp = icmp slt i32 %flag, 0 + br i1 %icmp, label %one, label %two + +one: + ret i32 42 + +two: + ret i32 44 +} + +define i32 @outer(i32 %x) { +entry: + %call1 = call i32 @inner(i32 %x) + %call2 = call i32 @inner(i32 %x) + %ret = add i32 %call1, %call2 + ret i32 %ret +} \ No newline at end of file diff --git a/llvm/test/Bitcode/value-with-long-name-dbg.ll b/llvm/test/Bitcode/value-with-long-name-dbg.ll new file mode 100644 index 00000000000000..0cc3569d8617b3 --- /dev/null +++ b/llvm/test/Bitcode/value-with-long-name-dbg.ll @@ -0,0 +1,11 @@ +; REQUIRES: asserts +; Force the size to be small to check assertion message. +; RUN: not --crash opt -S %s -O2 -o - -non-global-value-max-name-size=0 2>&1 | FileCheck %s +; CHECK: Can't generate unique name: MaxNameSize is too small. 
+ +define i32 @f(i32 %a, i32 %b) { + %c = add i32 %a, %b + %d = add i32 %c, %a + %e = add i32 %d, %b + ret i32 %e +} diff --git a/llvm/test/Bitcode/value-with-long-name.ll b/llvm/test/Bitcode/value-with-long-name.ll index 1ca5d133e09ae3..aa7da5f5b7dba9 100644 --- a/llvm/test/Bitcode/value-with-long-name.ll +++ b/llvm/test/Bitcode/value-with-long-name.ll @@ -1,10 +1,10 @@ ; Check the size of generated variable when no option is set ; RUN: opt -S %s -O2 -o - | FileCheck -check-prefix=CHECK-LONG %s +; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=-1 | FileCheck -check-prefix=CHECK-LONG %s ; CHECK-LONG: %{{[a-z]{4}[a-z]+}} ; Then check we correctly cap the size of newly generated non-global values name ; Force the size to be small so that the check works on release and debug build -; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=0 | FileCheck -check-prefix=CHECK-SHORT %s ; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=1 | FileCheck -check-prefix=CHECK-SHORT %s ; CHECK-SHORT-NOT: %{{[a-z][a-z]+}} @@ -14,5 +14,3 @@ define i32 @f(i32 %a, i32 %b) { %e = add i32 %d, %b ret i32 %e } - - From d2a103e682d65c3bfdff1d6a6f7b114e6cf4ff76 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 07:31:29 -0700 Subject: [PATCH 17/89] [memprof] Remove const from the return type of toMemProfRecord (#93415) "const" being removed in this patch prevents the move semantics from being used in: AI.CallStack = Callback(IndexedAI.CSId); With this patch on an indexed MemProf Version 2 profile, the cycle count and instruction count go down by 13.3% and 26.3%, respectively, with "llvm-profdata show" modified to deserialize all MemProfRecords. 
--- llvm/include/llvm/ProfileData/MemProf.h | 4 ++-- llvm/lib/ProfileData/MemProf.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 66a99f16cdb638..17cef15344285b 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -426,8 +426,8 @@ struct IndexedMemProfRecord { // Convert IndexedMemProfRecord to MemProfRecord. Callback is used to // translate CallStackId to call stacks with frames inline. MemProfRecord toMemProfRecord( - llvm::function_ref(const CallStackId)> - Callback) const; + llvm::function_ref(const CallStackId)> Callback) + const; // Returns the GUID for the function name after canonicalization. For // memprof, we remove any .llvm suffix added by LTO. MemProfRecords are diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index e5608644519db4..89afe7c39027c6 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -245,8 +245,8 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, } MemProfRecord IndexedMemProfRecord::toMemProfRecord( - llvm::function_ref(const CallStackId)> - Callback) const { + llvm::function_ref(const CallStackId)> Callback) + const { MemProfRecord Record; Record.AllocSites.reserve(AllocSites.size()); From 74ed79f7f123788d95f1552800e1af9ceaee4a08 Mon Sep 17 00:00:00 2001 From: Ryan Holt Date: Tue, 28 May 2024 10:42:32 -0400 Subject: [PATCH 18/89] [mlir][linalg] Add linalg.transpose constant folding (#92589) There was existing support for constant folding a `linalg.generic` that was actually a transpose. This commit adds support for the named op, `linalg.transpose`, as well by making use of the `LinalgOp` interface. 
--- .../Linalg/Transforms/ConstantFold.cpp | 62 ++++---- mlir/test/Dialect/Linalg/constant-fold.mlir | 148 ++++++++++++++++++ .../Linalg/fusion-elementwise-ops.mlir | 133 ---------------- 3 files changed, 180 insertions(+), 163 deletions(-) create mode 100644 mlir/test/Dialect/Linalg/constant-fold.mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp index 8fffabf11f3fdd..2e6079e1402e1d 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp @@ -23,21 +23,21 @@ using namespace mlir; using namespace mlir::linalg; namespace { -/// Base class for constant folding linalg.generic ops with N inputs, 1 output, -/// and permutation indexing maps. +/// Base class for constant folding linalg structured ops with N inputs, 1 +/// output, and permutation indexing maps. /// /// `ConcreteType` should provide methods with signatures /// /// ```c++ -/// bool matchIndexingMaps(GenericOp genericOp) const; -/// RegionComputationFn getRegionComputeFn(GenericOp) const; +/// bool matchIndexingMaps(LinalgOp linalgOp) const; +/// RegionComputationFn getRegionComputeFn(LinalgOp) const; /// ``` /// /// The latter inspects the region and returns the computation inside as a /// functor. The functor will be invoked with constant elements for all inputs /// and should return the corresponding computed constant element for output. 
template -class FoldConstantBase : public OpRewritePattern { +class FoldConstantBase : public OpInterfaceRewritePattern { public: struct APIntOrFloat { std::optional apInt; @@ -52,25 +52,26 @@ class FoldConstantBase : public OpRewritePattern { FoldConstantBase(MLIRContext *context, const ControlFusionFn &controlFn, PatternBenefit benefit = 1) - : OpRewritePattern(context, benefit), controlFn(controlFn) {} + : OpInterfaceRewritePattern(context, benefit), + controlFn(controlFn) {} - LogicalResult matchAndRewrite(GenericOp genericOp, + LogicalResult matchAndRewrite(LinalgOp linalgOp, PatternRewriter &rewriter) const override { // Mixed and buffer sematics aren't supported. - if (!genericOp.hasPureTensorSemantics()) + if (!linalgOp.hasPureTensorSemantics()) return failure(); // Only support ops generating one output for now. - if (genericOp.getNumDpsInits() != 1) + if (linalgOp.getNumDpsInits() != 1) return failure(); - auto outputType = dyn_cast(genericOp.getResultTypes().front()); + auto outputType = dyn_cast(linalgOp->getResultTypes().front()); // Require the output types to be static given that we are generating // constants. if (!outputType || !outputType.hasStaticShape()) return failure(); - if (!llvm::all_of(genericOp.getInputs(), [](Value input) { + if (!llvm::all_of(linalgOp.getDpsInputs(), [](Value input) { return isa(input.getType()); })) return failure(); @@ -80,7 +81,7 @@ class FoldConstantBase : public OpRewritePattern { return cast(value.getType()).getElementType(); }; if (!llvm::all_equal( - llvm::map_range(genericOp->getOperands(), getOperandElementType))) + llvm::map_range(linalgOp->getOperands(), getOperandElementType))) return failure(); // We can only handle the case where we have int/float elements. @@ -93,30 +94,30 @@ class FoldConstantBase : public OpRewritePattern { // entirely in the compiler, without needing to turn all indices into // Values, and then do affine apply on them, and then match back the // constant again. 
- if (!llvm::all_of(genericOp.getIndexingMapsArray(), + if (!llvm::all_of(linalgOp.getIndexingMapsArray(), [](AffineMap map) { return map.isPermutation(); })) return failure(); - for (OpOperand &operand : genericOp.getDpsInitsMutable()) { - if (genericOp.payloadUsesValueFromOperand(&operand)) + for (OpOperand &operand : linalgOp.getDpsInitsMutable()) { + if (linalgOp.payloadUsesValueFromOperand(&operand)) return failure(); } // Further check the indexing maps are okay for the ConcreteType. - if (!static_cast(this)->matchIndexingMaps(genericOp)) + if (!static_cast(this)->matchIndexingMaps(linalgOp)) return failure(); // Defer to the concrete type to check the region and discover the // computation inside. RegionComputationFn computeFn = - static_cast(this)->getRegionComputeFn(genericOp); + static_cast(this)->getRegionComputeFn(linalgOp); if (!computeFn) return failure(); // All inputs should be constants. - int numInputs = genericOp.getNumDpsInputs(); + int numInputs = linalgOp.getNumDpsInputs(); SmallVector inputValues(numInputs); - for (const auto &en : llvm::enumerate(genericOp.getDpsInputOperands())) { + for (const auto &en : llvm::enumerate(linalgOp.getDpsInputOperands())) { if (!matchPattern(en.value()->get(), m_Constant(&inputValues[en.index()]))) return failure(); @@ -124,12 +125,11 @@ class FoldConstantBase : public OpRewritePattern { // Identified this as a potential candidate for folding. Now check the // policy to see whether we are allowed to proceed. 
- for (OpOperand *operand : genericOp.getDpsInputOperands()) { + for (OpOperand *operand : linalgOp.getDpsInputOperands()) { if (!controlFn(operand)) return failure(); } - auto linalgOp = cast(genericOp.getOperation()); SmallVector loopBounds = linalgOp.computeStaticLoopSizes(); int64_t numElements = outputType.getNumElements(); @@ -155,8 +155,8 @@ class FoldConstantBase : public OpRewritePattern { SmallVector> inputDims; for (int i = 0; i < numInputs; ++i) - inputDims.push_back(getDimPositions(genericOp.getIndexingMapsArray()[i])); - auto outputDims = getDimPositions(genericOp.getIndexingMapsArray().back()); + inputDims.push_back(getDimPositions(linalgOp.getIndexingMapsArray()[i])); + auto outputDims = getDimPositions(linalgOp.getIndexingMapsArray().back()); auto outputShape = outputType.getShape(); // Allocate small vectors for index delinearization. Initial values do not @@ -173,7 +173,7 @@ class FoldConstantBase : public OpRewritePattern { APIntOrFloatArray computeFnInputs; auto inputShapes = llvm::to_vector<4>( - llvm::map_range(genericOp.getInputs(), [](Value value) { + llvm::map_range(linalgOp.getDpsInputs(), [](Value value) { return cast(value.getType()).getShape(); })); @@ -254,7 +254,7 @@ class FoldConstantBase : public OpRewritePattern { isFloat ? DenseElementsAttr::get(outputType, fpOutputValues) : DenseElementsAttr::get(outputType, intOutputValues); - rewriter.replaceOpWithNewOp(genericOp, outputAttr); + rewriter.replaceOpWithNewOp(linalgOp, outputAttr); return success(); } @@ -262,18 +262,20 @@ class FoldConstantBase : public OpRewritePattern { ControlFusionFn controlFn; }; -// Folds linalg.generic ops that are actually transposes on constant values. +// Folds linalg.transpose (and linalg.generic ops that are actually transposes) +// on constant values. 
struct FoldConstantTranspose : public FoldConstantBase { + using FoldConstantBase::FoldConstantBase; - bool matchIndexingMaps(GenericOp genericOp) const { + bool matchIndexingMaps(LinalgOp linalgOp) const { // We should have one input and one output. - return genericOp.getIndexingMapsArray().size() == 2; + return linalgOp.getIndexingMapsArray().size() == 2; } - RegionComputationFn getRegionComputeFn(GenericOp genericOp) const { + RegionComputationFn getRegionComputeFn(LinalgOp linalgOp) const { // Make sure the region only contains a yield op. - Block &body = genericOp.getRegion().front(); + Block &body = linalgOp->getRegion(0).front(); if (!llvm::hasSingleElement(body)) return nullptr; auto yieldOp = dyn_cast(body.getTerminator()); diff --git a/mlir/test/Dialect/Linalg/constant-fold.mlir b/mlir/test/Dialect/Linalg/constant-fold.mlir new file mode 100644 index 00000000000000..3929c26a3382f4 --- /dev/null +++ b/mlir/test/Dialect/Linalg/constant-fold.mlir @@ -0,0 +1,148 @@ +// RUN: mlir-opt %s -linalg-fuse-elementwise-ops -split-input-file | FileCheck %s + +// CHECK-LABEL: @transpose_fold_2d_fp32 +func.func @transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { + %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> + // CHECK: %[[CST:.+]] = arith.constant + // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32> + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { + ^bb0(%arg1: f32, %arg2: f32): + linalg.yield %arg1 : f32 + } -> tensor<3x2xf32> + // CHECK: return %[[CST]] + return %1 : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @transpose_fold_2d_fp64 +func.func @transpose_fold_2d_fp64(%init: tensor<3x2xf64>) -> tensor<3x2xf64> { + %input = arith.constant dense<[[0.0, 
1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf64> + // CHECK: %[[CST:.+]] = arith.constant + // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf64> + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf64>) outs(%init : tensor<3x2xf64>) { + ^bb0(%arg1: f64, %arg2: f64): + linalg.yield %arg1 : f64 + } -> tensor<3x2xf64> + // CHECK: return %[[CST]] + return %1 : tensor<3x2xf64> +} + +// ----- + +// CHECK-LABEL: @transpose_fold_4d_i32 +func.func @transpose_fold_4d_i32(%init: tensor<3x1x4x2xi32>) -> tensor<3x1x4x2xi32> { + %input = arith.constant dense<[[ + [[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], + [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] + ]]> : tensor<1x2x3x4xi32> + // CHECK: %[[CST:.+]] = arith.constant dense<[ + // CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]], + // CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]], + // CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]] + // CHECK-SAME{LITERAL}: ]> + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"] + } ins(%input : tensor<1x2x3x4xi32>) outs(%init : tensor<3x1x4x2xi32>) { + ^bb0(%arg1: i32, %arg2: i32): + linalg.yield %arg1 : i32 + } -> tensor<3x1x4x2xi32> + // CHECK: return %[[CST]] + return %1 : tensor<3x1x4x2xi32> +} + +// ----- + +// CHECK-LABEL: @transpose_fold_4d_i16 +func.func @transpose_fold_4d_i16(%init: tensor<3x1x4x2xi16>) -> tensor<3x1x4x2xi16> { + %input = arith.constant dense<[[ + [[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], + [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] + ]]> : tensor<1x2x3x4xi16> + // CHECK: %[[CST:.+]] = arith.constant dense<[ + // 
CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]], + // CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]], + // CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]] + // CHECK-SAME{LITERAL}: ]> + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"] + } ins(%input : tensor<1x2x3x4xi16>) outs(%init : tensor<3x1x4x2xi16>) { + ^bb0(%arg1: i16, %arg2: i16): + linalg.yield %arg1 : i16 + } -> tensor<3x1x4x2xi16> + // CHECK: return %[[CST]] + return %1 : tensor<3x1x4x2xi16> +} + +// ----- + +// CHECK-LABEL: @transpose_nofold_non_cst_input +func.func @transpose_nofold_non_cst_input(%input: tensor<2x3xf32>, %init: tensor<3x2xf32>) -> tensor<3x2xf32> { + // CHECK: linalg.generic + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { + ^bb0(%arg1: f32, %arg2: f32): + linalg.yield %arg1 : f32 + } -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @transpose_nofold_yield_const +func.func @transpose_nofold_yield_const(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { + %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> + %cst = arith.constant 8.0 : f32 + // CHECK: linalg.generic + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { + ^bb0(%arg1: f32, %arg2: f32): + linalg.yield %cst : f32 + } -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @transpose_nofold_multi_ops_in_region +func.func @transpose_nofold_multi_ops_in_region(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { + 
%input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> + // CHECK: linalg.generic + %1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"] + } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { + ^bb0(%arg1: f32, %arg2: f32): + %add = arith.addf %arg1, %arg1 : f32 + linalg.yield %add : f32 + } -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @named_transpose_fold_2d_fp32 +func.func @named_transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { + %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> + // CHECK: %[[CST:.+]] = arith.constant + // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32> + %1 = linalg.transpose ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) permutation = [1, 0] + // CHECK: return %[[CST]] + return %1 : tensor<3x2xf32> +} + +// ----- + + diff --git a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir index 15a4f6cdd3bbe4..e45a9fbb1052c1 100644 --- a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir +++ b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir @@ -777,139 +777,6 @@ func.func @fuse_scalar_constant(%arg0 : tensor) -> (tensor, te // ----- -// CHECK-LABEL: @transpose_fold_2d_fp32 -func.func @transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { - %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> - // CHECK: %[[CST:.+]] = arith.constant - // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32> - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", 
"parallel"] - } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { - ^bb0(%arg1: f32, %arg2: f32): - linalg.yield %arg1 : f32 - } -> tensor<3x2xf32> - // CHECK: return %[[CST]] - return %1 : tensor<3x2xf32> -} - -// ----- - -// CHECK-LABEL: @transpose_fold_2d_fp64 -func.func @transpose_fold_2d_fp64(%init: tensor<3x2xf64>) -> tensor<3x2xf64> { - %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf64> - // CHECK: %[[CST:.+]] = arith.constant - // CHECK-SAME{LITERAL}: dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf64> - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"] - } ins(%input : tensor<2x3xf64>) outs(%init : tensor<3x2xf64>) { - ^bb0(%arg1: f64, %arg2: f64): - linalg.yield %arg1 : f64 - } -> tensor<3x2xf64> - // CHECK: return %[[CST]] - return %1 : tensor<3x2xf64> -} - -// ----- - -// CHECK-LABEL: @transpose_fold_4d_i32 -func.func @transpose_fold_4d_i32(%init: tensor<3x1x4x2xi32>) -> tensor<3x1x4x2xi32> { - %input = arith.constant dense<[[ - [[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], - [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] - ]]> : tensor<1x2x3x4xi32> - // CHECK: %[[CST:.+]] = arith.constant dense<[ - // CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]], - // CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]], - // CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]] - // CHECK-SAME{LITERAL}: ]> - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], - iterator_types = ["parallel", "parallel", "parallel", "parallel"] - } ins(%input : tensor<1x2x3x4xi32>) outs(%init : tensor<3x1x4x2xi32>) { - ^bb0(%arg1: i32, %arg2: i32): - linalg.yield %arg1 : i32 - } -> tensor<3x1x4x2xi32> - // CHECK: return %[[CST]] - return %1 : 
tensor<3x1x4x2xi32> -} - -// ----- - -// CHECK-LABEL: @transpose_fold_4d_i16 -func.func @transpose_fold_4d_i16(%init: tensor<3x1x4x2xi16>) -> tensor<3x1x4x2xi16> { - %input = arith.constant dense<[[ - [[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], - [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] - ]]> : tensor<1x2x3x4xi16> - // CHECK: %[[CST:.+]] = arith.constant dense<[ - // CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]], - // CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]], - // CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]] - // CHECK-SAME{LITERAL}: ]> - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], - iterator_types = ["parallel", "parallel", "parallel", "parallel"] - } ins(%input : tensor<1x2x3x4xi16>) outs(%init : tensor<3x1x4x2xi16>) { - ^bb0(%arg1: i16, %arg2: i16): - linalg.yield %arg1 : i16 - } -> tensor<3x1x4x2xi16> - // CHECK: return %[[CST]] - return %1 : tensor<3x1x4x2xi16> -} - -// ----- - -// CHECK-LABEL: @transpose_nofold_non_cst_input -func.func @transpose_nofold_non_cst_input(%input: tensor<2x3xf32>, %init: tensor<3x2xf32>) -> tensor<3x2xf32> { - // CHECK: linalg.generic - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"] - } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { - ^bb0(%arg1: f32, %arg2: f32): - linalg.yield %arg1 : f32 - } -> tensor<3x2xf32> - return %1 : tensor<3x2xf32> -} - -// ----- - -// CHECK-LABEL: @transpose_nofold_yield_const -func.func @transpose_nofold_yield_const(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { - %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> - %cst = arith.constant 8.0 : f32 - // CHECK: linalg.generic - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - 
iterator_types = ["parallel", "parallel"] - } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { - ^bb0(%arg1: f32, %arg2: f32): - linalg.yield %cst : f32 - } -> tensor<3x2xf32> - return %1 : tensor<3x2xf32> -} - -// ----- - -// CHECK-LABEL: @transpose_nofold_multi_ops_in_region -func.func @transpose_nofold_multi_ops_in_region(%init: tensor<3x2xf32>) -> tensor<3x2xf32> { - %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32> - // CHECK: linalg.generic - %1 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"] - } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) { - ^bb0(%arg1: f32, %arg2: f32): - %add = arith.addf %arg1, %arg1 : f32 - linalg.yield %add : f32 - } -> tensor<3x2xf32> - return %1 : tensor<3x2xf32> -} - -// ----- - // Fusing the broadcast into a reduction would require to insert extra knowledge // about the size of the reduction dimension. As long, as this is not // implemented, we check that two linalg operations remain. From cde1ae4c14eecd47215f04d4387845231021d939 Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Tue, 28 May 2024 11:11:55 -0400 Subject: [PATCH 19/89] [lldb][NativePDB] Fix uninitialized values found by msan. 
--- .../source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp index fab3ca989c0ec6..17c5f6118603f4 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp @@ -47,15 +47,18 @@ UdtRecordCompleter::UdtRecordCompleter( CVType cvt = m_index.tpi().getType(m_id.index); switch (cvt.kind()) { case LF_ENUM: + m_cvr.er.Options = ClassOptions::None; llvm::cantFail(TypeDeserializer::deserializeAs(cvt, m_cvr.er)); break; case LF_UNION: + m_cvr.ur.Options = ClassOptions::None; llvm::cantFail(TypeDeserializer::deserializeAs(cvt, m_cvr.ur)); m_layout.bit_size = m_cvr.ur.getSize() * 8; m_record.record.kind = Member::Union; break; case LF_CLASS: case LF_STRUCTURE: + m_cvr.cr.Options = ClassOptions::None; llvm::cantFail(TypeDeserializer::deserializeAs(cvt, m_cvr.cr)); m_layout.bit_size = m_cvr.cr.getSize() * 8; m_record.record.kind = Member::Struct; From 94be801879788399a7ffa8c7cbe28f6c86e26ffe Mon Sep 17 00:00:00 2001 From: stefankoncarevic Date: Tue, 28 May 2024 17:17:02 +0200 Subject: [PATCH 20/89] [mlir][ROCDL] Update the LLVM data layout for ROCDL lowering. (#92127) This change updates the dataLayout string to ensure alignment with the latest LLVM TargetMachine configuration. The aim is to maintain consistency and prevent potential compilation issues related to memory address space handling. 
--- mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 6 +++--- mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index f425b1f59d9940..70dcccf0a7307a 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -77,9 +77,9 @@ Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, } static constexpr StringLiteral amdgcnDataLayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" - "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:" - "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-" - "G1-ni:7:8"; + "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:" + "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:" + "64-S32-A5-G1-ni:7:8:9"; namespace { struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 8a2d8bd7967caf..a8d61a6a0f6fd9 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -2,7 +2,8 @@ // RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s // CHECK-LABEL: @test_module -// CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +// CHECK-SAME: llvm.data_layout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" + gpu.module @test_module { // CHECK-LABEL: func @gpu_index_ops() // CHECK32-LABEL: func @gpu_index_ops() From 26e0ce0b3633c67e09d2f3a99e0d4058a4e0a887 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Tue, 28 May 2024 17:32:27 +0200 Subject: [PATCH 21/89] [flang] update fir.box_rank and fir.is_array codegen (#93541) fir.box_rank codegen was invalid, it was assuming the rank field in the descriptor was an i32. This is not correct. Do not hard code the type, use the named position to find the type, and convert as needed in the patterns. --- flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h | 4 ++++ flang/lib/Optimizer/CodeGen/CodeGen.cpp | 9 ++++----- flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp | 8 ++++++++ flang/test/Fir/convert-to-llvm.fir | 9 +++++---- flang/test/Fir/tbaa.fir | 11 ++++++----- 5 files changed, 27 insertions(+), 14 deletions(-) diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h index 06a44f1885656f..510ff729989145 100644 --- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h +++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h @@ -101,6 +101,10 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern { mlir::Value box, mlir::ConversionPatternRewriter &rewriter) const; + mlir::Value getRankFromBox(mlir::Location loc, TypePair boxTy, + mlir::Value box, + mlir::ConversionPatternRewriter &rewriter) const; + // Get the element type given an LLVM type that is of the form // (array|struct|vector)+ and the provided indexes. 
mlir::Type getBoxEleTy(mlir::Type type, diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 74e68725003cb9..664453ebaf2f74 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -391,9 +391,8 @@ struct BoxIsArrayOpConversion : public fir::FIROpConversion { mlir::Value a = adaptor.getOperands()[0]; auto loc = boxisarray.getLoc(); TypePair boxTyPair = getBoxTypePair(boxisarray.getVal().getType()); - auto rank = getValueFromBox(loc, boxTyPair, a, rewriter.getI32Type(), - rewriter, kRankPosInBox); - auto c0 = genConstantOffset(loc, rewriter, 0); + mlir::Value rank = getRankFromBox(loc, boxTyPair, a, rewriter); + mlir::Value c0 = genConstantIndex(loc, rank.getType(), rewriter, 0); rewriter.replaceOpWithNewOp( boxisarray, mlir::LLVM::ICmpPredicate::ne, rank, c0); return mlir::success(); @@ -430,8 +429,8 @@ struct BoxRankOpConversion : public fir::FIROpConversion { auto loc = boxrank.getLoc(); mlir::Type ty = convertType(boxrank.getType()); TypePair boxTyPair = getBoxTypePair(boxrank.getVal().getType()); - auto result = - getValueFromBox(loc, boxTyPair, a, ty, rewriter, kRankPosInBox); + mlir::Value rank = getRankFromBox(loc, boxTyPair, a, rewriter); + mlir::Value result = integerCast(loc, rewriter, ty, rank); rewriter.replaceOp(boxrank, result); return mlir::success(); } diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp index 69e78167b07333..8c726d547491a7 100644 --- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp +++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp @@ -179,6 +179,14 @@ mlir::Value ConvertFIRToLLVMPattern::getElementSizeFromBox( return getValueFromBox(loc, boxTy, box, resultTy, rewriter, kElemLenPosInBox); } +/// Read base address from a fir.box. Returned address has type ty. 
+mlir::Value ConvertFIRToLLVMPattern::getRankFromBox( + mlir::Location loc, TypePair boxTy, mlir::Value box, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resultTy = getBoxEleTy(boxTy.llvm, {kRankPosInBox}); + return getValueFromBox(loc, boxTy, box, resultTy, rewriter, kRankPosInBox); +} + // Get the element type given an LLVM type that is of the form // (array|struct|vector)+ and the provided indexes. mlir::Type ConvertFIRToLLVMPattern::getBoxEleTy( diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 21323a5e657c94..70cb0443e9a645 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -941,7 +941,8 @@ func.func @extract_rank(%arg0: !fir.box>) -> i32 { // CHECK-LABEL: llvm.func @extract_rank( // CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr) -> i32 // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})> -// CHECK: %[[RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i32 +// CHECK: %[[RAW_RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i8 +// CHECK: %[[RANK:.*]] = llvm.sext %[[RAW_RANK]] : i8 to i32 // CHECK: llvm.return %[[RANK]] : i32 // ----- @@ -1009,9 +1010,9 @@ func.func @box_isarray(%arg0: !fir.box>) -> i1 { // CHECK-LABEL: llvm.func @box_isarray( // CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr) -> i1 // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})> -// CHECK: %[[RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i32 -// CHECK: %[[C0_ISARRAY:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[IS_ARRAY:.*]] = llvm.icmp "ne" %[[RANK]], %[[C0_ISARRAY]] : i32 +// CHECK: %[[RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i8 +// CHECK: %[[C0_ISARRAY:.*]] = llvm.mlir.constant(0 : i64) : i8 +// CHECK: %[[IS_ARRAY:.*]] = llvm.icmp "ne" %[[RANK]], %[[C0_ISARRAY]] : i8 // CHECK: llvm.return 
%[[IS_ARRAY]] : i1 // ----- diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir index 048f53f5c6e47a..f4f23d35cba257 100644 --- a/flang/test/Fir/tbaa.fir +++ b/flang/test/Fir/tbaa.fir @@ -248,8 +248,9 @@ func.func @tbaa(%arg0: !fir.box>) -> i32 { // CHECK-LABEL: llvm.func @tbaa( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr) -> i32 { // CHECK: %[[VAL_1:.*]] = llvm.getelementptr %[[VAL_0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> -// CHECK: %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i32 -// CHECK: llvm.return %[[VAL_2]] : i32 +// CHECK: %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i8 +// CHECK: %[[VAL_3:.*]] = llvm.sext %[[VAL_2]] : i8 to i32 +// CHECK: llvm.return %[[VAL_3]] : i32 // CHECK: } // ----- @@ -267,9 +268,9 @@ func.func @tbaa(%arg0: !fir.box>) -> i1 { // CHECK-LABEL: llvm.func @tbaa( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr) -> i1 { // CHECK: %[[VAL_1:.*]] = llvm.getelementptr %[[VAL_0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> -// CHECK: %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i32 -// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_4:.*]] = llvm.icmp "ne" %[[VAL_2]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i8 +// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(0 : i64) : i8 +// CHECK: %[[VAL_4:.*]] = llvm.icmp "ne" %[[VAL_2]], %[[VAL_3]] : i8 // CHECK: llvm.return %[[VAL_4]] : i1 // CHECK: } From 88902147c11f8de5cc7c792fd8c476a821664297 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 28 May 2024 10:21:40 -0500 Subject: [PATCH 22/89] [Frontend][OpenMP] Rename some variables, NFC Rename things in a couple of places to make the code a bit clearer. 
--- .../llvm/Frontend/OpenMP/ConstructDecompositionT.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index 3fa27608ead948..3feb4bd11c998f 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -371,9 +371,8 @@ ConstructDecompositionT::addClauseSymsToMap(U &&item, // anything and return false, otherwise return true. template bool ConstructDecompositionT::applyToUnique(const ClauseTy *node) { - auto unique = detail::find_unique(leafs, [=](const auto &dirInfo) { - return llvm::omp::isAllowedClauseForDirective(dirInfo.id, node->id, - version); + auto unique = detail::find_unique(leafs, [=](const auto &leaf) { + return llvm::omp::isAllowedClauseForDirective(leaf.id, node->id, version); }); if (unique != leafs.end()) { @@ -438,8 +437,8 @@ bool ConstructDecompositionT::applyToAll(const ClauseTy *node) { } template -template -bool ConstructDecompositionT::applyClause(Clause &&clause, +template +bool ConstructDecompositionT::applyClause(Specific &&specific, const ClauseTy *node) { // The default behavior is to find the unique directive to which the // given clause may be applied. If there are no such directives, or From 51dd4eaaa29683c16151f5168e7f8645acbd6e6c Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Tue, 28 May 2024 11:49:07 -0400 Subject: [PATCH 23/89] Reapply [lldb][DWARF] Delay struct/class/union definition DIE searching when parsing declaration DIEs. (#92328) This reapplies https://github.com/llvm/llvm-project/commit/9a7262c2601874e5aa64c5db19746770212d4b44 (#90663) and added https://github.com/llvm/llvm-project/pull/91808 as a fix. It was causing tests on macos to fail because `SymbolFileDWARF::GetForwardDeclCompilerTypeToDIE` returned the map owned by this symol file. 
When there were two symbol files, two different maps were created for caching from compiler type to DIE even if they are for the same module. The solution is to do the same as `SymbolFileDWARF::GetUniqueDWARFASTTypeMap`: inquery SymbolFileDWARFDebugMap first to get the shared underlying SymbolFile so the map is shared among multiple SymbolFileDWARF. --- .../Plugins/SymbolFile/DWARF/DWARFASTParser.h | 2 + .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 397 ++++++++++-------- .../SymbolFile/DWARF/DWARFASTParserClang.h | 197 ++++----- .../SymbolFile/DWARF/DebugNamesDWARFIndex.cpp | 4 + .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 51 ++- .../SymbolFile/DWARF/SymbolFileDWARF.h | 15 +- .../DWARF/SymbolFileDWARFDebugMap.h | 9 + .../SymbolFile/DWARF/SymbolFileDWARFDwo.cpp | 2 +- .../SymbolFile/DWARF/SymbolFileDWARFDwo.h | 3 +- .../SymbolFile/DWARF/UniqueDWARFASTType.cpp | 107 ++--- .../SymbolFile/DWARF/UniqueDWARFASTType.h | 36 +- .../delayed-definition-die-searching.test | 36 ++ 12 files changed, 467 insertions(+), 392 deletions(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h index 66db396279e063..e144cf0f9bd94e 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h @@ -60,6 +60,8 @@ class DWARFASTParser { virtual ConstString GetDIEClassTemplateParams(const DWARFDIE &die) = 0; + virtual lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) = 0; + static std::optional ParseChildArrayInfo(const DWARFDIE &parent_die, const ExecutionContext *exe_ctx = nullptr); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index f8101aba5c6277..e0b1b430b266f3 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ 
b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -154,6 +154,26 @@ static bool TagIsRecordType(dw_tag_t tag) { } } +static bool IsForwardDeclaration(const DWARFDIE &die, + const ParsedDWARFTypeAttributes &attrs, + LanguageType cu_language) { + if (attrs.is_forward_declaration) + return true; + + // Work around an issue with clang at the moment where forward + // declarations for objective C classes are emitted as: + // DW_TAG_structure_type [2] + // DW_AT_name( "ForwardObjcClass" ) + // DW_AT_byte_size( 0x00 ) + // DW_AT_decl_file( "..." ) + // DW_AT_decl_line( 1 ) + // + // Note that there is no DW_AT_declaration and there are no children, + // and the byte size is zero. + return attrs.byte_size && *attrs.byte_size == 0 && attrs.name && + !die.HasChildren() && cu_language == eLanguageTypeObjC; +} + TypeSP DWARFASTParserClang::ParseTypeFromClangModule(const SymbolContext &sc, const DWARFDIE &die, Log *log) { @@ -249,11 +269,9 @@ static void ForcefullyCompleteType(CompilerType type) { /// This function serves a similar purpose as RequireCompleteType above, but it /// avoids completing the type if it is not immediately necessary. It only /// ensures we _can_ complete the type later. -static void PrepareContextToReceiveMembers(TypeSystemClang &ast, - ClangASTImporter &ast_importer, - clang::DeclContext *decl_ctx, - DWARFDIE die, - const char *type_name_cstr) { +void DWARFASTParserClang::PrepareContextToReceiveMembers( + clang::DeclContext *decl_ctx, const DWARFDIE &decl_ctx_die, + const DWARFDIE &die, const char *type_name_cstr) { auto *tag_decl_ctx = clang::dyn_cast(decl_ctx); if (!tag_decl_ctx) return; // Non-tag context are always ready. @@ -268,7 +286,8 @@ static void PrepareContextToReceiveMembers(TypeSystemClang &ast, // gmodules case), we can complete the type by doing a full import. // If this type was not imported from an external AST, there's nothing to do. 
- CompilerType type = ast.GetTypeForDecl(tag_decl_ctx); + CompilerType type = m_ast.GetTypeForDecl(tag_decl_ctx); + ClangASTImporter &ast_importer = GetClangASTImporter(); if (type && ast_importer.CanImport(type)) { auto qual_type = ClangUtil::GetQualType(type); if (ast_importer.RequireCompleteType(qual_type)) @@ -279,6 +298,13 @@ static void PrepareContextToReceiveMembers(TypeSystemClang &ast, type_name_cstr ? type_name_cstr : "", die.GetOffset()); } + // By searching for the definition DIE of the decl_ctx type, we will either: + // 1. Found the the definition DIE and start its definition with + // TypeSystemClang::StartTagDeclarationDefinition. + // 2. Unable to find it, then need to forcefully complete it. + FindDefinitionTypeForDIE(decl_ctx_die); + if (tag_decl_ctx->isCompleteDefinition() || tag_decl_ctx->isBeingDefined()) + return; // We don't have a type definition and/or the import failed. We must // forcefully complete the type to avoid crashes. ForcefullyCompleteType(type); @@ -620,10 +646,11 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc, if (tag == DW_TAG_typedef) { // DeclContext will be populated when the clang type is materialized in // Type::ResolveCompilerType. 
- PrepareContextToReceiveMembers( - m_ast, GetClangASTImporter(), - GetClangDeclContextContainingDIE(die, nullptr), die, - attrs.name.GetCString()); + DWARFDIE decl_ctx_die; + clang::DeclContext *decl_ctx = + GetClangDeclContextContainingDIE(die, &decl_ctx_die); + PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die, + attrs.name.GetCString()); if (attrs.type.IsValid()) { // Try to parse a typedef from the (DWARF embedded in the) Clang @@ -1103,32 +1130,6 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, // struct and see if this is actually a C++ method Type *class_type = dwarf->ResolveType(decl_ctx_die); if (class_type) { - if (class_type->GetID() != decl_ctx_die.GetID() || - IsClangModuleFwdDecl(decl_ctx_die)) { - - // We uniqued the parent class of this function to another - // class so we now need to associate all dies under - // "decl_ctx_die" to DIEs in the DIE for "class_type"... - DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID()); - - if (class_type_die) { - std::vector failures; - - CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die, - class_type, failures); - - // FIXME do something with these failures that's - // smarter than just dropping them on the ground. - // Unfortunately classes don't like having stuff added - // to them after their definitions are complete... 
- - Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()]; - if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) { - return type_ptr->shared_from_this(); - } - } - } - if (attrs.specification.IsValid()) { // We have a specification which we are going to base our // function prototype off of, so we need this type to be @@ -1263,6 +1264,39 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, } } } + // By here, we should have already completed the c++ class_type + // because if either specification or abstract_origin is present, we + // call GetClangDeclContextForDIE to resolve the DW_TAG_subprogram + // refered by this one until we reached the DW_TAG_subprogram without + // specification or abstract_origin (the else branch above). Then the + // above GetFullCompilerType() will complete the class_type if it's + // not completed yet. After that, we will have the mapping from DIEs + // in class_type_die to DeclContexts in m_die_to_decl_ctx. + if (class_type->GetID() != decl_ctx_die.GetID() || + IsClangModuleFwdDecl(decl_ctx_die)) { + + // We uniqued the parent class of this function to another + // class so we now need to associate all dies under + // "decl_ctx_die" to DIEs in the DIE for "class_type"... + DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID()); + + if (class_type_die) { + std::vector failures; + + CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die, + class_type, failures); + + // FIXME do something with these failures that's + // smarter than just dropping them on the ground. + // Unfortunately classes don't like having stuff added + // to them after their definitions are complete... 
+ + Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()]; + if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) { + return type_ptr->shared_from_this(); + } + } + } } } } @@ -1635,6 +1669,93 @@ DWARFASTParserClang::GetCPlusPlusQualifiedName(const DWARFDIE &die) { return qualified_name; } +lldb_private::Type * +DWARFASTParserClang::FindDefinitionTypeForDIE(const DWARFDIE &die) { + SymbolFileDWARF *dwarf = die.GetDWARF(); + ParsedDWARFTypeAttributes attrs(die); + bool is_forward_declaration = IsForwardDeclaration( + die, attrs, SymbolFileDWARF::GetLanguage(*die.GetCU())); + if (!is_forward_declaration) + return dwarf->GetDIEToType()[die.GetDIE()]; + + const dw_tag_t tag = die.Tag(); + TypeSP type_sp; + Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups); + if (log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a " + "forward declaration DIE, trying to find definition DIE", + static_cast(this), die.GetOffset(), DW_TAG_value_to_name(tag), + attrs.name.GetCString()); + } + // We haven't parse definition die for this type, starting to search for it. + // After we found the definition die, the GetDeclarationDIEToDefinitionDIE() + // map will have the new mapping from this declaration die to definition die. 
+ if (attrs.class_language == eLanguageTypeObjC || + attrs.class_language == eLanguageTypeObjC_plus_plus) { + if (!attrs.is_complete_objc_class && + die.Supports_DW_AT_APPLE_objc_complete_type()) { + // We have a valid eSymbolTypeObjCClass class symbol whose name + // matches the current objective C class that we are trying to find + // and this DIE isn't the complete definition (we checked + // is_complete_objc_class above and know it is false), so the real + // definition is in here somewhere + type_sp = + dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true); + + if (!type_sp) { + SymbolFileDWARFDebugMap *debug_map_symfile = + dwarf->GetDebugMapSymfile(); + if (debug_map_symfile) { + // We weren't able to find a full declaration in this DWARF, + // see if we have a declaration anywhere else... + type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE( + die, attrs.name, true); + } + } + + if (type_sp && log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an " + "incomplete objc type, complete type is {5:x8}", + static_cast(this), die.GetOffset(), + DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), + type_sp->GetID()); + } + } + } + + type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die); + if (!type_sp) { + SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile(); + if (debug_map_symfile) { + // We weren't able to find a full declaration in this DWARF, see + // if we have a declaration anywhere else... 
+ type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die); + } + if (type_sp && log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a " + "forward declaration, complete type is {4:x8}", + static_cast(this), die.GetOffset(), DW_TAG_value_to_name(tag), + attrs.name.GetCString(), type_sp->GetID()); + } + } + + if (!type_sp && log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a " + "forward declaration, unable to find definition DIE for it", + static_cast(this), die.GetOffset(), DW_TAG_value_to_name(tag), + attrs.name.GetCString()); + } + return type_sp.get(); +} + TypeSP DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, const DWARFDIE &die, @@ -1646,14 +1767,10 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, LanguageType cu_language = SymbolFileDWARF::GetLanguage(*die.GetCU()); Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups); - // UniqueDWARFASTType is large, so don't create a local variables on the - // stack, put it on the heap. This function is often called recursively and - // clang isn't good at sharing the stack space for variables in different - // blocks. 
- auto unique_ast_entry_up = std::make_unique(); - ConstString unique_typename(attrs.name); Declaration unique_decl(attrs.decl); + uint64_t byte_size = attrs.byte_size.value_or(0); + attrs.is_forward_declaration = IsForwardDeclaration(die, attrs, cu_language); if (attrs.name) { if (Language::LanguageIsCPlusPlus(cu_language)) { @@ -1666,14 +1783,42 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, unique_decl.Clear(); } - if (dwarf->GetUniqueDWARFASTTypeMap().Find( - unique_typename, die, unique_decl, attrs.byte_size.value_or(-1), - *unique_ast_entry_up)) { - type_sp = unique_ast_entry_up->m_type_sp; + if (UniqueDWARFASTType *unique_ast_entry_type = + dwarf->GetUniqueDWARFASTTypeMap().Find( + unique_typename, die, unique_decl, byte_size, + attrs.is_forward_declaration)) { + type_sp = unique_ast_entry_type->m_type_sp; if (type_sp) { dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); LinkDeclContextToDIE( - GetCachedClangDeclContextForDIE(unique_ast_entry_up->m_die), die); + GetCachedClangDeclContextForDIE(unique_ast_entry_type->m_die), die); + if (!attrs.is_forward_declaration) { + // If the DIE being parsed in this function is a definition and the + // entry in the map is a declaration, then we need to update the entry + // to point to the definition DIE. + if (unique_ast_entry_type->m_is_forward_declaration) { + unique_ast_entry_type->m_die = die; + unique_ast_entry_type->m_byte_size = byte_size; + unique_ast_entry_type->m_declaration = unique_decl; + unique_ast_entry_type->m_is_forward_declaration = false; + // Need to update Type ID to refer to the definition DIE. because + // it's used in ParseSubroutine to determine if we need to copy cxx + // method types from a declaration DIE to this definition DIE. 
+ type_sp->SetID(die.GetID()); + clang_type = type_sp->GetForwardCompilerType(); + if (attrs.class_language != eLanguageTypeObjC && + attrs.class_language != eLanguageTypeObjC_plus_plus) + TypeSystemClang::StartTagDeclarationDefinition(clang_type); + + CompilerType compiler_type_no_qualifiers = + ClangUtil::RemoveFastQualifiers(clang_type); + auto result = dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace( + compiler_type_no_qualifiers.GetOpaqueQualType(), + *die.GetDIERef()); + if (!result.second) + result.first->second = *die.GetDIERef(); + } + } return type_sp; } } @@ -1695,125 +1840,21 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, default_accessibility = eAccessPrivate; } - if (attrs.byte_size && *attrs.byte_size == 0 && attrs.name && - !die.HasChildren() && cu_language == eLanguageTypeObjC) { - // Work around an issue with clang at the moment where forward - // declarations for objective C classes are emitted as: - // DW_TAG_structure_type [2] - // DW_AT_name( "ForwardObjcClass" ) - // DW_AT_byte_size( 0x00 ) - // DW_AT_decl_file( "..." ) - // DW_AT_decl_line( 1 ) - // - // Note that there is no DW_AT_declaration and there are no children, - // and the byte size is zero. 
- attrs.is_forward_declaration = true; - } - - if (attrs.class_language == eLanguageTypeObjC || - attrs.class_language == eLanguageTypeObjC_plus_plus) { - if (!attrs.is_complete_objc_class && - die.Supports_DW_AT_APPLE_objc_complete_type()) { - // We have a valid eSymbolTypeObjCClass class symbol whose name - // matches the current objective C class that we are trying to find - // and this DIE isn't the complete definition (we checked - // is_complete_objc_class above and know it is false), so the real - // definition is in here somewhere - type_sp = - dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true); - - if (!type_sp) { - SymbolFileDWARFDebugMap *debug_map_symfile = - dwarf->GetDebugMapSymfile(); - if (debug_map_symfile) { - // We weren't able to find a full declaration in this DWARF, - // see if we have a declaration anywhere else... - type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE( - die, attrs.name, true); - } - } - - if (type_sp) { - if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an " - "incomplete objc type, complete type is {5:x8}", - static_cast(this), die.GetOffset(), - DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), - type_sp->GetID()); - } - - // We found a real definition for this type elsewhere so lets use - // it and cache the fact that we found a complete type for this - // die - dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); - return type_sp; - } - } - } - if (attrs.is_forward_declaration) { - // We have a forward declaration to a type and we need to try and - // find a full declaration. We look in the current type index just in - // case we have a forward declaration followed by an actual - // declarations in the DWARF. If this fails, we need to look - // elsewhere... 
- if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a " - "forward declaration, trying to find complete type", - static_cast(this), die.GetOffset(), DW_TAG_value_to_name(tag), - tag, attrs.name.GetCString()); - } - // See if the type comes from a Clang module and if so, track down // that type. type_sp = ParseTypeFromClangModule(sc, die, log); if (type_sp) return type_sp; - - // type_sp = FindDefinitionTypeForDIE (dwarf_cu, die, - // type_name_const_str); - type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die); - - if (!type_sp) { - SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile(); - if (debug_map_symfile) { - // We weren't able to find a full declaration in this DWARF, see - // if we have a declaration anywhere else... - type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die); - } - } - - if (type_sp) { - if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a " - "forward declaration, complete type is {5:x8}", - static_cast(this), die.GetOffset(), - DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), - type_sp->GetID()); - } - - // We found a real definition for this type elsewhere so lets use - // it and cache the fact that we found a complete type for this die - dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); - clang::DeclContext *defn_decl_ctx = - GetCachedClangDeclContextForDIE(dwarf->GetDIE(type_sp->GetID())); - if (defn_decl_ctx) - LinkDeclContextToDIE(defn_decl_ctx, die); - return type_sp; - } } + assert(tag_decl_kind != -1); UNUSED_IF_ASSERT_DISABLED(tag_decl_kind); - bool clang_type_was_created = false; - clang::DeclContext *decl_ctx = GetClangDeclContextContainingDIE(die, nullptr); + DWARFDIE decl_ctx_die; + clang::DeclContext *decl_ctx = + GetClangDeclContextContainingDIE(die, &decl_ctx_die); - PrepareContextToReceiveMembers(m_ast, 
GetClangASTImporter(), decl_ctx, die, + PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die, attrs.name.GetCString()); if (attrs.accessibility == eAccessNone && decl_ctx) { @@ -1852,20 +1893,17 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, tag_decl_kind, template_param_infos); clang_type = m_ast.CreateClassTemplateSpecializationType(class_specialization_decl); - clang_type_was_created = true; m_ast.SetMetadata(class_template_decl, metadata); m_ast.SetMetadata(class_specialization_decl, metadata); } - if (!clang_type_was_created) { - clang_type_was_created = true; + if (!clang_type) { clang_type = m_ast.CreateRecordType( decl_ctx, GetOwningClangModule(die), attrs.accessibility, attrs.name.GetCString(), tag_decl_kind, attrs.class_language, &metadata, attrs.exports_symbols); } - // Store a forward declaration to this class type in case any // parameters in any class methods need it for the clang types for // function prototypes. @@ -1876,13 +1914,19 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, Type::ResolveState::Forward, TypePayloadClang(OptionalClangModuleID(), attrs.is_complete_objc_class)); + // UniqueDWARFASTType is large, so don't create a local variables on the + // stack, put it on the heap. This function is often called recursively and + // clang isn't good at sharing the stack space for variables in different + // blocks. 
+ auto unique_ast_entry_up = std::make_unique(); // Add our type to the unique type map so we don't end up creating many // copies of the same type over and over in the ASTContext for our // module unique_ast_entry_up->m_type_sp = type_sp; unique_ast_entry_up->m_die = die; unique_ast_entry_up->m_declaration = unique_decl; - unique_ast_entry_up->m_byte_size = attrs.byte_size.value_or(0); + unique_ast_entry_up->m_byte_size = byte_size; + unique_ast_entry_up->m_is_forward_declaration = attrs.is_forward_declaration; dwarf->GetUniqueDWARFASTTypeMap().Insert(unique_typename, *unique_ast_entry_up); @@ -1923,7 +1967,7 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, GetClangASTImporter().SetRecordLayout(record_decl, layout); } } - } else if (clang_type_was_created) { + } else { // Start the definition if the class is not objective C since the // underlying decls respond to isCompleteDefinition(). Objective // C decls don't respond to isCompleteDefinition() so we can't @@ -1935,26 +1979,21 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, if (attrs.class_language != eLanguageTypeObjC && attrs.class_language != eLanguageTypeObjC_plus_plus) TypeSystemClang::StartTagDeclarationDefinition(clang_type); - - // Leave this as a forward declaration until we need to know the - // details of the type. lldb_private::Type will automatically call - // the SymbolFile virtual function - // "SymbolFileDWARF::CompleteType(Type *)" When the definition - // needs to be defined. - assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count( - ClangUtil::RemoveFastQualifiers(clang_type) - .GetOpaqueQualType()) && - "Type already in the forward declaration map!"); - // Can't assume m_ast.GetSymbolFile() is actually a - // SymbolFileDWARF, it can be a SymbolFileDWARFDebugMap for Apple - // binaries. 
- dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace( - ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(), - *die.GetDIERef()); - m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true); } } + // If this is a declaration DIE, leave this as a forward declaration until we + // need to know the details of the type. lldb_private::Type will automatically + // call the SymbolFile virtual function "SymbolFileDWARF::CompleteType(Type + // *)" When the definition needs to be defined. + assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count( + ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType()) && + "Type already in the forward declaration map!"); + dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace( + ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(), + *die.GetDIERef()); + m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true); + // If we made a clang type, set the trivial abi if applicable: We only // do this for pass by value - which implies the Trivial ABI. There // isn't a way to assert that something that would normally be pass by diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index 8d4af203bb2871..853b8ccc30369f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -42,40 +42,40 @@ struct ParsedDWARFTypeAttributes; class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { public: + typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE; + DWARFASTParserClang(lldb_private::TypeSystemClang &ast); ~DWARFASTParserClang() override; // DWARFASTParser interface. 
- lldb::TypeSP - ParseTypeFromDWARF(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - bool *type_is_new_ptr) override; + lldb::TypeSP ParseTypeFromDWARF(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, + bool *type_is_new_ptr) override; - lldb_private::ConstString ConstructDemangledNameFromDWARF( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::ConstString + ConstructDemangledNameFromDWARF(const DWARFDIE &die) override; lldb_private::Function * ParseFunctionFromDWARF(lldb_private::CompileUnit &comp_unit, - const lldb_private::plugin::dwarf::DWARFDIE &die, + const DWARFDIE &die, const lldb_private::AddressRange &func_range) override; bool - CompleteTypeFromDWARF(const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::Type *type, + CompleteTypeFromDWARF(const DWARFDIE &die, lldb_private::Type *type, lldb_private::CompilerType &compiler_type) override; - lldb_private::CompilerDecl GetDeclForUIDFromDWARF( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::CompilerDecl + GetDeclForUIDFromDWARF(const DWARFDIE &die) override; void EnsureAllDIEsInDeclContextHaveBeenParsed( lldb_private::CompilerDeclContext decl_context) override; - lldb_private::CompilerDeclContext GetDeclContextForUIDFromDWARF( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::CompilerDeclContext + GetDeclContextForUIDFromDWARF(const DWARFDIE &die) override; - lldb_private::CompilerDeclContext GetDeclContextContainingUIDFromDWARF( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::CompilerDeclContext + GetDeclContextContainingUIDFromDWARF(const DWARFDIE &die) override; lldb_private::ClangASTImporter &GetClangASTImporter(); @@ -105,8 +105,13 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \return A string, including surrounding '<>', of the template parameters. 
/// If the DIE's name already has '<>', returns an empty ConstString because /// it's assumed that the caller is using the DIE name anyway. - lldb_private::ConstString GetDIEClassTemplateParams( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + lldb_private::ConstString + GetDIEClassTemplateParams(const DWARFDIE &die) override; + + // Searching for definition DIE for the given DIE and return the type + // associated with the definition DIE, or nullptr if definition DIE is not + // found. + lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) override; protected: /// Protected typedefs and members. @@ -118,8 +123,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *, clang::DeclContext *> DIEToDeclContextMap; - typedef std::multimap + typedef std::multimap DeclContextToDIEMap; typedef llvm::DenseMap< const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *, @@ -137,14 +141,11 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { std::unique_ptr m_clang_ast_importer_up; /// @} - clang::DeclContext * - GetDeclContextForBlock(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::DeclContext *GetDeclContextForBlock(const DWARFDIE &die); - clang::BlockDecl * - ResolveBlockDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::BlockDecl *ResolveBlockDIE(const DWARFDIE &die); - clang::NamespaceDecl * - ResolveNamespaceDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::NamespaceDecl *ResolveNamespaceDIE(const DWARFDIE &die); /// Returns the namespace decl that a DW_TAG_imported_declaration imports. /// @@ -155,96 +156,86 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// 'die' imports. If the imported entity is not a namespace /// or another import declaration, returns nullptr. If an error /// occurs, returns nullptr. 
- clang::NamespaceDecl *ResolveImportedDeclarationDIE( - const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::NamespaceDecl *ResolveImportedDeclarationDIE(const DWARFDIE &die); - bool ParseTemplateDIE(const lldb_private::plugin::dwarf::DWARFDIE &die, + bool ParseTemplateDIE(const DWARFDIE &die, lldb_private::TypeSystemClang::TemplateParameterInfos &template_param_infos); bool ParseTemplateParameterInfos( - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + const DWARFDIE &parent_die, lldb_private::TypeSystemClang::TemplateParameterInfos &template_param_infos); - std::string - GetCPlusPlusQualifiedName(const lldb_private::plugin::dwarf::DWARFDIE &die); + std::string GetCPlusPlusQualifiedName(const DWARFDIE &die); bool ParseChildMembers( - const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::CompilerType &class_compiler_type, + const DWARFDIE &die, lldb_private::CompilerType &class_compiler_type, std::vector> &base_classes, - std::vector &member_function_dies, - std::vector &contained_type_dies, + std::vector &member_function_dies, + std::vector &contained_type_dies, DelayedPropertyList &delayed_properties, const lldb::AccessType default_accessibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info); size_t ParseChildParameters(clang::DeclContext *containing_decl_ctx, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - bool skip_artificial, bool &is_static, bool &is_variadic, + const DWARFDIE &parent_die, bool skip_artificial, + bool &is_static, bool &is_variadic, bool &has_template_params, std::vector &function_args, std::vector &function_param_decls, unsigned &type_quals); - size_t ParseChildEnumerators( - lldb_private::CompilerType &compiler_type, bool is_signed, - uint32_t enumerator_byte_size, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die); + size_t ParseChildEnumerators(lldb_private::CompilerType &compiler_type, + bool is_signed, uint32_t enumerator_byte_size, + const DWARFDIE &parent_die); /// Parse a 
structure, class, or union type DIE. - lldb::TypeSP - ParseStructureLikeDIE(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP ParseStructureLikeDIE(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs); - clang::Decl * - GetClangDeclForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::Decl *GetClangDeclForDIE(const DWARFDIE &die); - clang::DeclContext * - GetClangDeclContextForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::DeclContext *GetClangDeclContextForDIE(const DWARFDIE &die); - clang::DeclContext *GetClangDeclContextContainingDIE( - const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::plugin::dwarf::DWARFDIE *decl_ctx_die); - lldb_private::OptionalClangModuleID - GetOwningClangModule(const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::DeclContext *GetClangDeclContextContainingDIE(const DWARFDIE &die, + DWARFDIE *decl_ctx_die); + lldb_private::OptionalClangModuleID GetOwningClangModule(const DWARFDIE &die); - bool CopyUniqueClassMethodTypes( - const lldb_private::plugin::dwarf::DWARFDIE &src_class_die, - const lldb_private::plugin::dwarf::DWARFDIE &dst_class_die, - lldb_private::Type *class_type, - std::vector &failures); + bool CopyUniqueClassMethodTypes(const DWARFDIE &src_class_die, + const DWARFDIE &dst_class_die, + lldb_private::Type *class_type, + std::vector &failures); - clang::DeclContext *GetCachedClangDeclContextForDIE( - const lldb_private::plugin::dwarf::DWARFDIE &die); + clang::DeclContext *GetCachedClangDeclContextForDIE(const DWARFDIE &die); - void LinkDeclContextToDIE(clang::DeclContext *decl_ctx, - const lldb_private::plugin::dwarf::DWARFDIE &die); + void LinkDeclContextToDIE(clang::DeclContext *decl_ctx, const DWARFDIE &die); - void LinkDeclToDIE(clang::Decl *decl, - const lldb_private::plugin::dwarf::DWARFDIE &die); + void LinkDeclToDIE(clang::Decl 
*decl, const DWARFDIE &die); /// If \p type_sp is valid, calculate and set its symbol context scope, and /// update the type list for its backing symbol file. /// /// Returns \p type_sp. - lldb::TypeSP UpdateSymbolContextScopeForType( - const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, lldb::TypeSP type_sp); + lldb::TypeSP + UpdateSymbolContextScopeForType(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, lldb::TypeSP type_sp); /// Follow Clang Module Skeleton CU references to find a type definition. - lldb::TypeSP - ParseTypeFromClangModule(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::Log *log); + lldb::TypeSP ParseTypeFromClangModule(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, + lldb_private::Log *log); // Return true if this type is a declaration to a type in an external // module. - lldb::ModuleSP - GetModuleForType(const lldb_private::plugin::dwarf::DWARFDIE &die); + lldb::ModuleSP GetModuleForType(const DWARFDIE &die); + + void PrepareContextToReceiveMembers(clang::DeclContext *decl_ctx, + const DWARFDIE &decl_ctx_die, + const DWARFDIE &die, + const char *type_name_cstr); static bool classof(const DWARFASTParser *Parser) { return Parser->GetKind() == Kind::DWARFASTParserClang; @@ -274,10 +265,8 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// Parsed form of all attributes that are relevant for parsing type members. struct MemberAttributes { - explicit MemberAttributes( - const lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - lldb::ModuleSP module_sp); + explicit MemberAttributes(const DWARFDIE &die, const DWARFDIE &parent_die, + lldb::ModuleSP module_sp); const char *name = nullptr; /// Indicates how many bits into the word (according to the host endianness) /// the low-order bit of the field starts. Can be negative. 
@@ -324,15 +313,12 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// created property. /// \param delayed_properties The list of delayed properties that the result /// will be appended to. - void - ParseObjCProperty(const lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - const lldb_private::CompilerType &class_clang_type, - DelayedPropertyList &delayed_properties); + void ParseObjCProperty(const DWARFDIE &die, const DWARFDIE &parent_die, + const lldb_private::CompilerType &class_clang_type, + DelayedPropertyList &delayed_properties); void - ParseSingleMember(const lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + ParseSingleMember(const DWARFDIE &die, const DWARFDIE &parent_die, const lldb_private::CompilerType &class_clang_type, lldb::AccessType default_accessibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info, @@ -350,31 +336,25 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \param[in] class_clang_type The parent RecordType of the static /// member this function will create. 
void CreateStaticMemberVariable( - const lldb_private::plugin::dwarf::DWARFDIE &die, - const MemberAttributes &attrs, + const DWARFDIE &die, const MemberAttributes &attrs, const lldb_private::CompilerType &class_clang_type); - bool CompleteRecordType(const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::Type *type, + bool CompleteRecordType(const DWARFDIE &die, lldb_private::Type *type, lldb_private::CompilerType &clang_type); - bool CompleteEnumType(const lldb_private::plugin::dwarf::DWARFDIE &die, - lldb_private::Type *type, + bool CompleteEnumType(const DWARFDIE &die, lldb_private::Type *type, lldb_private::CompilerType &clang_type); - lldb::TypeSP - ParseTypeModifier(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP ParseTypeModifier(const lldb_private::SymbolContext &sc, + const DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs); lldb::TypeSP ParseEnum(const lldb_private::SymbolContext &sc, - const lldb_private::plugin::dwarf::DWARFDIE &die, - ParsedDWARFTypeAttributes &attrs); - lldb::TypeSP ParseSubroutine(const lldb_private::plugin::dwarf::DWARFDIE &die, + const DWARFDIE &die, ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP ParseSubroutine(const DWARFDIE &die, const ParsedDWARFTypeAttributes &attrs); - lldb::TypeSP ParseArrayType(const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb::TypeSP ParseArrayType(const DWARFDIE &die, const ParsedDWARFTypeAttributes &attrs); - lldb::TypeSP - ParsePointerToMemberType(const lldb_private::plugin::dwarf::DWARFDIE &die, - const ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP ParsePointerToMemberType(const DWARFDIE &die, + const ParsedDWARFTypeAttributes &attrs); /// Parses a DW_TAG_inheritance DIE into a base/super class. 
/// @@ -391,8 +371,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \param layout_info The layout information that will be updated for C++ /// base classes with the base offset. void ParseInheritance( - const lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + const DWARFDIE &die, const DWARFDIE &parent_die, const lldb_private::CompilerType class_clang_type, const lldb::AccessType default_accessibility, const lldb::ModuleSP &module_sp, @@ -409,8 +388,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \param layout_info The layout information that will be updated for // base classes with the base offset void - ParseRustVariantPart(lldb_private::plugin::dwarf::DWARFDIE &die, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + ParseRustVariantPart(DWARFDIE &die, const DWARFDIE &parent_die, lldb_private::CompilerType &class_clang_type, const lldb::AccessType default_accesibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info); @@ -420,8 +398,9 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// Some attributes are relevant for all kinds of types (declaration), while /// others are only meaningful to a specific type (is_virtual) struct ParsedDWARFTypeAttributes { - explicit ParsedDWARFTypeAttributes( - const lldb_private::plugin::dwarf::DWARFDIE &die); + typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE; + + explicit ParsedDWARFTypeAttributes(const DWARFDIE &die); lldb::AccessType accessibility = lldb::eAccessNone; bool is_artificial = false; @@ -438,7 +417,7 @@ struct ParsedDWARFTypeAttributes { const char *mangled_name = nullptr; lldb_private::ConstString name; lldb_private::Declaration decl; - lldb_private::plugin::dwarf::DWARFDIE object_pointer; + DWARFDIE object_pointer; lldb_private::plugin::dwarf::DWARFFormValue abstract_origin; lldb_private::plugin::dwarf::DWARFFormValue 
containing_type; lldb_private::plugin::dwarf::DWARFFormValue signature; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp index 79400e36e04f3f..c98e5481609dea 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp @@ -87,6 +87,10 @@ bool DebugNamesDWARFIndex::ProcessEntry( DWARFDIE die = dwarf.GetDIE(*ref); if (!die) return true; + // Clang erroneously emits index entries for declaration DIEs in case when the + // definition is in a type unit (llvm.org/pr77696). Weed those out. + if (die.GetAttributeValueAsUnsigned(DW_AT_declaration, 0)) + return true; return callback(die); } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index f6f152726bf74e..bc489e5b8ad465 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -481,6 +481,13 @@ static ConstString GetDWARFMachOSegmentName() { return g_dwarf_section_name; } +llvm::DenseMap & +SymbolFileDWARF::GetForwardDeclCompilerTypeToDIE() { + if (SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile()) + return debug_map_symfile->GetForwardDeclCompilerTypeToDIE(); + return m_forward_decl_compiler_type_to_die; +} + UniqueDWARFASTTypeMap &SymbolFileDWARF::GetUniqueDWARFASTTypeMap() { SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile(); if (debug_map_symfile) @@ -1632,27 +1639,33 @@ bool SymbolFileDWARF::CompleteType(CompilerType &compiler_type) { return true; } - DWARFDIE dwarf_die = GetDIE(die_it->getSecond()); - if (dwarf_die) { - // Once we start resolving this type, remove it from the forward - // declaration map in case anyone child members or other types require this - // type to get resolved. 
The type will get resolved when all of the calls - // to SymbolFileDWARF::ResolveClangOpaqueTypeDefinition are done. - GetForwardDeclCompilerTypeToDIE().erase(die_it); - - Type *type = GetDIEToType().lookup(dwarf_die.GetDIE()); + // Once we start resolving this type, remove it from the forward + // declaration map in case anyone's child members or other types require this + // type to get resolved. + DWARFDIE dwarf_die = GetDIE(die_it->second); + GetForwardDeclCompilerTypeToDIE().erase(die_it); + Type *type = nullptr; + if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) + type = dwarf_ast->FindDefinitionTypeForDIE(dwarf_die); + if (!type) + return false; - Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion); - if (log) - GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( - log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...", - dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()), - dwarf_die.Tag(), type->GetName().AsCString()); - assert(compiler_type); - if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) - return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type); + die_it = GetForwardDeclCompilerTypeToDIE().find( + compiler_type_no_qualifiers.GetOpaqueQualType()); + if (die_it != GetForwardDeclCompilerTypeToDIE().end()) { + dwarf_die = GetDIE(die_it->getSecond()); + GetForwardDeclCompilerTypeToDIE().erase(die_it); } - return false; + + if (Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion)) + GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( + log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...", + dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()), + dwarf_die.Tag(), type->GetName().AsCString()); + assert(compiler_type); + if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) + return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type); + return true; } Type *SymbolFileDWARF::ResolveType(const DWARFDIE &die, diff 
--git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 7282c08c6857c9..35893f2072dd64 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -335,12 +335,8 @@ class SymbolFileDWARF : public SymbolFileCommon { virtual DIEToTypePtr &GetDIEToType() { return m_die_to_type; } - typedef llvm::DenseMap - CompilerTypeToDIE; - - virtual CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() { - return m_forward_decl_compiler_type_to_die; - } + virtual llvm::DenseMap & + GetForwardDeclCompilerTypeToDIE(); typedef llvm::DenseMap DIEToVariableSP; @@ -533,9 +529,14 @@ class SymbolFileDWARF : public SymbolFileCommon { NameToOffsetMap m_function_scope_qualified_name_map; std::unique_ptr m_ranges; UniqueDWARFASTTypeMap m_unique_ast_type_map; + // A map from DIE to lldb_private::Type. For record type, the key might be + // either declaration DIE or definition DIE. DIEToTypePtr m_die_to_type; DIEToVariableSP m_die_to_variable_sp; - CompilerTypeToDIE m_forward_decl_compiler_type_to_die; + // A map from CompilerType to the struct/class/union/enum DIE (might be a + // declaration or a definition) that is used to construct it. 
+ llvm::DenseMap + m_forward_decl_compiler_type_to_die; llvm::DenseMap> m_type_unit_support_files; std::vector m_lldb_cu_to_dwarf_unit; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h index de22dd676eef0a..d7d571919bc7d6 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h @@ -284,6 +284,11 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon { lldb::TypeSP FindCompleteObjCDefinitionTypeForDIE( const DWARFDIE &die, ConstString type_name, bool must_be_implementation); + llvm::DenseMap & + GetForwardDeclCompilerTypeToDIE() { + return m_forward_decl_compiler_type_to_die; + } + UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() { return m_unique_ast_type_map; } @@ -321,6 +326,10 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon { std::vector m_func_indexes; // Sorted by address std::vector m_glob_indexes; std::map>, OSOInfoSP> m_oso_map; + // A map from CompilerType to the struct/class/union/enum DIE (might be a + // declaration or a definition) that is used to construct it. 
+ llvm::DenseMap + m_forward_decl_compiler_type_to_die; UniqueDWARFASTTypeMap m_unique_ast_type_map; LazyBool m_supports_DW_AT_APPLE_objc_complete_type; DebugMap m_debug_map; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp index 85e1afd0d89761..8fd369c65f86b6 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp @@ -110,7 +110,7 @@ SymbolFileDWARF::DIEToVariableSP &SymbolFileDWARFDwo::GetDIEToVariable() { return GetBaseSymbolFile().GetDIEToVariable(); } -SymbolFileDWARF::CompilerTypeToDIE & +llvm::DenseMap & SymbolFileDWARFDwo::GetForwardDeclCompilerTypeToDIE() { return GetBaseSymbolFile().GetForwardDeclCompilerTypeToDIE(); } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h index 1500540424b524..2f0ac415e90d40 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h @@ -72,7 +72,8 @@ class SymbolFileDWARFDwo : public SymbolFileDWARF { DIEToVariableSP &GetDIEToVariable() override; - CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() override; + llvm::DenseMap & + GetForwardDeclCompilerTypeToDIE() override; UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() override; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp index 223518f0ae8241..4762356034cab7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp @@ -13,66 +13,67 @@ using namespace lldb_private::dwarf; using namespace lldb_private::plugin::dwarf; -bool UniqueDWARFASTTypeList::Find(const DWARFDIE &die, - const lldb_private::Declaration &decl, - const int32_t byte_size, - UniqueDWARFASTType &entry) const { - for 
(const UniqueDWARFASTType &udt : m_collection) { +UniqueDWARFASTType *UniqueDWARFASTTypeList::Find( + const DWARFDIE &die, const lldb_private::Declaration &decl, + const int32_t byte_size, bool is_forward_declaration) { + for (UniqueDWARFASTType &udt : m_collection) { // Make sure the tags match if (udt.m_die.Tag() == die.Tag()) { - // Validate byte sizes of both types only if both are valid. - if (udt.m_byte_size < 0 || byte_size < 0 || - udt.m_byte_size == byte_size) { - // Make sure the file and line match - if (udt.m_declaration == decl) { - // The type has the same name, and was defined on the same file and - // line. Now verify all of the parent DIEs match. - DWARFDIE parent_arg_die = die.GetParent(); - DWARFDIE parent_pos_die = udt.m_die.GetParent(); - bool match = true; - bool done = false; - while (!done && match && parent_arg_die && parent_pos_die) { - const dw_tag_t parent_arg_tag = parent_arg_die.Tag(); - const dw_tag_t parent_pos_tag = parent_pos_die.Tag(); - if (parent_arg_tag == parent_pos_tag) { - switch (parent_arg_tag) { - case DW_TAG_class_type: - case DW_TAG_structure_type: - case DW_TAG_union_type: - case DW_TAG_namespace: { - const char *parent_arg_die_name = parent_arg_die.GetName(); - if (parent_arg_die_name == - nullptr) // Anonymous (i.e. no-name) struct - { - match = false; - } else { - const char *parent_pos_die_name = parent_pos_die.GetName(); - if (parent_pos_die_name == nullptr || - ((parent_arg_die_name != parent_pos_die_name) && - strcmp(parent_arg_die_name, parent_pos_die_name))) - match = false; - } - } break; - - case DW_TAG_compile_unit: - case DW_TAG_partial_unit: - done = true; - break; - default: - break; - } + // If they are not both definition DIEs or both declaration DIEs, then + // don't check for byte size and declaration location, because declaration + // DIEs usually don't have those info. + bool matching_size_declaration = + udt.m_is_forward_declaration != is_forward_declaration + ? 
true + : (udt.m_byte_size < 0 || byte_size < 0 || + udt.m_byte_size == byte_size) && + udt.m_declaration == decl; + if (!matching_size_declaration) + continue; + // The type has the same name, and was defined on the same file and + // line. Now verify all of the parent DIEs match. + DWARFDIE parent_arg_die = die.GetParent(); + DWARFDIE parent_pos_die = udt.m_die.GetParent(); + bool match = true; + bool done = false; + while (!done && match && parent_arg_die && parent_pos_die) { + const dw_tag_t parent_arg_tag = parent_arg_die.Tag(); + const dw_tag_t parent_pos_tag = parent_pos_die.Tag(); + if (parent_arg_tag == parent_pos_tag) { + switch (parent_arg_tag) { + case DW_TAG_class_type: + case DW_TAG_structure_type: + case DW_TAG_union_type: + case DW_TAG_namespace: { + const char *parent_arg_die_name = parent_arg_die.GetName(); + if (parent_arg_die_name == nullptr) { + // Anonymous (i.e. no-name) struct + match = false; + } else { + const char *parent_pos_die_name = parent_pos_die.GetName(); + if (parent_pos_die_name == nullptr || + ((parent_arg_die_name != parent_pos_die_name) && + strcmp(parent_arg_die_name, parent_pos_die_name))) + match = false; } - parent_arg_die = parent_arg_die.GetParent(); - parent_pos_die = parent_pos_die.GetParent(); - } + } break; - if (match) { - entry = udt; - return true; + case DW_TAG_compile_unit: + case DW_TAG_partial_unit: + done = true; + break; + default: + break; } } + parent_arg_die = parent_arg_die.GetParent(); + parent_pos_die = parent_pos_die.GetParent(); + } + + if (match) { + return &udt; } } } - return false; + return nullptr; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h index bf3cbae55e5c7b..29e5c02dcbe176 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h @@ -23,31 +23,19 @@ class UniqueDWARFASTType { // Constructors and Destructors UniqueDWARFASTType() 
: m_type_sp(), m_die(), m_declaration() {} - UniqueDWARFASTType(lldb::TypeSP &type_sp, const DWARFDIE &die, - const Declaration &decl, int32_t byte_size) - : m_type_sp(type_sp), m_die(die), m_declaration(decl), - m_byte_size(byte_size) {} - UniqueDWARFASTType(const UniqueDWARFASTType &rhs) : m_type_sp(rhs.m_type_sp), m_die(rhs.m_die), - m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size) {} + m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size), + m_is_forward_declaration(rhs.m_is_forward_declaration) {} ~UniqueDWARFASTType() = default; - UniqueDWARFASTType &operator=(const UniqueDWARFASTType &rhs) { - if (this != &rhs) { - m_type_sp = rhs.m_type_sp; - m_die = rhs.m_die; - m_declaration = rhs.m_declaration; - m_byte_size = rhs.m_byte_size; - } - return *this; - } - lldb::TypeSP m_type_sp; DWARFDIE m_die; Declaration m_declaration; int32_t m_byte_size = -1; + // True if the m_die is a forward declaration DIE. + bool m_is_forward_declaration = true; }; class UniqueDWARFASTTypeList { @@ -62,8 +50,9 @@ class UniqueDWARFASTTypeList { m_collection.push_back(entry); } - bool Find(const DWARFDIE &die, const Declaration &decl, - const int32_t byte_size, UniqueDWARFASTType &entry) const; + UniqueDWARFASTType *Find(const DWARFDIE &die, const Declaration &decl, + const int32_t byte_size, + bool is_forward_declaration); protected: typedef std::vector collection; @@ -80,14 +69,15 @@ class UniqueDWARFASTTypeMap { m_collection[name.GetCString()].Append(entry); } - bool Find(ConstString name, const DWARFDIE &die, const Declaration &decl, - const int32_t byte_size, UniqueDWARFASTType &entry) const { + UniqueDWARFASTType *Find(ConstString name, const DWARFDIE &die, + const Declaration &decl, const int32_t byte_size, + bool is_forward_declaration) { const char *unique_name_cstr = name.GetCString(); - collection::const_iterator pos = m_collection.find(unique_name_cstr); + collection::iterator pos = m_collection.find(unique_name_cstr); if (pos != m_collection.end()) 
{ - return pos->second.Find(die, decl, byte_size, entry); + return pos->second.Find(die, decl, byte_size, is_forward_declaration); } - return false; + return nullptr; } protected: diff --git a/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test new file mode 100644 index 00000000000000..d253981b498c81 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test @@ -0,0 +1,36 @@ +# Test definition DIE searching is delayed until complete type is required. + +# UNSUPPORTED: system-windows + +# RUN: split-file %s %t +# RUN: %clangxx_host %t/main.cpp %t/t1_def.cpp -gdwarf -o %t.out +# RUN: %lldb -b %t.out -s %t/lldb.cmd | FileCheck %s + +# CHECK: (lldb) p v1 +# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't2' +# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1' +# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't2' resolving forward declaration... +# CHECK: (t2) {} +# CHECK: (lldb) p v2 +# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1' +# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't1' resolving forward declaration... + +#--- lldb.cmd +log enable dwarf comp +p v1 +p v2 + +#--- main.cpp +template +struct t2 { +}; +struct t1; +t2 v1; // this CU doesn't have definition DIE for t1, but only declaration DIE for it. +int main() { +} + +#--- t1_def.cpp +struct t1 { // this CU contains definition DIE for t1. + int x; +}; +t1 v2; From d490ce22e93db2e9d57985bc50915e383327911f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 08:51:42 -0700 Subject: [PATCH 24/89] [RISCV] Use mask undisturbed policy when silencing sNans for strict rounding ops. (#93356) The elements that aren't sNans need to get passed through this fadd instruction unchanged. 
With the agnostic mask policy they might be forced to all ones. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- .../RISCV/rvv/fceil-constrained-sdnode.ll | 45 ++++++++++++------- .../RISCV/rvv/ffloor-constrained-sdnode.ll | 45 ++++++++++++------- .../fixed-vectors-fceil-constrained-sdnode.ll | 45 ++++++++++++------- ...fixed-vectors-ffloor-constrained-sdnode.ll | 45 ++++++++++++------- ...d-vectors-fnearbyint-constrained-sdnode.ll | 36 ++++++++++----- ...fixed-vectors-fround-constrained-sdnode.ll | 45 ++++++++++++------- ...d-vectors-froundeven-constrained-sdnode.ll | 45 ++++++++++++------- ...fixed-vectors-ftrunc-constrained-sdnode.ll | 45 ++++++++++++------- .../rvv/fnearbyint-constrained-sdnode.ll | 45 ++++++++++++------- .../RISCV/rvv/fround-constrained-sdnode.ll | 45 ++++++++++++------- .../rvv/froundeven-constrained-sdnode.ll | 45 ++++++++++++------- .../RISCV/rvv/ftrunc-constrained-sdnode.ll | 45 ++++++++++++------- 13 files changed, 355 insertions(+), 178 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f0e5a7d393b6c9..c826892c1668ec 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3125,7 +3125,7 @@ lowerVectorStrictFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG, Chain = Unorder.getValue(1); Src = DAG.getNode(RISCVISD::STRICT_FADD_VL, DL, DAG.getVTList(ContainerVT, MVT::Other), - {Chain, Src, Src, DAG.getUNDEF(ContainerVT), Unorder, VL}); + {Chain, Src, Src, Src, Unorder, VL}); Chain = Src.getValue(1); // We do the conversion on the absolute value and fix the sign at the end. 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll index 75747a6674b7b4..d8781495abd75c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll @@ -7,7 +7,7 @@ define @ceil_nxv1f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -15,6 +15,7 @@ define @ceil_nxv1f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -29,7 +30,7 @@ declare @llvm.experimental.constrained.ceil.nxv1f16( @ceil_nxv2f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -37,6 +38,7 @@ define @ceil_nxv2f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -51,7 +53,7 @@ declare @llvm.experimental.constrained.ceil.nxv2f16( @ceil_nxv4f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -59,6 +61,7 @@ define @ceil_nxv4f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 
; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -73,7 +76,7 @@ declare @llvm.experimental.constrained.ceil.nxv4f16( @ceil_nxv8f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -81,6 +84,7 @@ define @ceil_nxv8f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -95,7 +99,7 @@ declare @llvm.experimental.constrained.ceil.nxv8f16( @ceil_nxv16f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -103,6 +107,7 @@ define @ceil_nxv16f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -117,7 +122,7 @@ declare @llvm.experimental.constrained.ceil.nxv16f16( @ceil_nxv32f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -125,6 +130,7 @@ define @ceil_nxv32f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: 
vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -139,7 +145,7 @@ declare @llvm.experimental.constrained.ceil.nxv32f16( @ceil_nxv1f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -147,6 +153,7 @@ define @ceil_nxv1f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -161,7 +168,7 @@ declare @llvm.experimental.constrained.ceil.nxv1f32( @ceil_nxv2f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -169,6 +176,7 @@ define @ceil_nxv2f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -183,7 +191,7 @@ declare @llvm.experimental.constrained.ceil.nxv2f32( @ceil_nxv4f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -191,6 +199,7 @@ define @ceil_nxv4f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: 
fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -205,7 +214,7 @@ declare @llvm.experimental.constrained.ceil.nxv4f32( @ceil_nxv8f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -213,6 +222,7 @@ define @ceil_nxv8f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -227,7 +237,7 @@ declare @llvm.experimental.constrained.ceil.nxv8f32( @ceil_nxv16f32( %x) strictfp { ; CHECK-LABEL: ceil_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -235,6 +245,7 @@ define @ceil_nxv16f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -249,7 +260,7 @@ declare @llvm.experimental.constrained.ceil.nxv16f32( @ceil_nxv1f64( %x) strictfp { ; CHECK-LABEL: ceil_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -257,6 +268,7 @@ define @ceil_nxv1f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; 
CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -271,7 +283,7 @@ declare @llvm.experimental.constrained.ceil.nxv1f64( @ceil_nxv2f64( %x) strictfp { ; CHECK-LABEL: ceil_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -279,6 +291,7 @@ define @ceil_nxv2f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -293,7 +306,7 @@ declare @llvm.experimental.constrained.ceil.nxv2f64( @ceil_nxv4f64( %x) strictfp { ; CHECK-LABEL: ceil_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -301,6 +314,7 @@ define @ceil_nxv4f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -315,7 +329,7 @@ declare @llvm.experimental.constrained.ceil.nxv4f64( @ceil_nxv8f64( %x) strictfp { ; CHECK-LABEL: ceil_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -323,6 +337,7 @@ define @ceil_nxv8f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; 
CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll index 31a94532044574..1df452d8641c58 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll @@ -7,7 +7,7 @@ define @floor_nxv1f16( %x) strictfp { ; CHECK-LABEL: floor_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -15,6 +15,7 @@ define @floor_nxv1f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -29,7 +30,7 @@ declare @llvm.experimental.constrained.floor.nxv1f16( @floor_nxv2f16( %x) strictfp { ; CHECK-LABEL: floor_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -37,6 +38,7 @@ define @floor_nxv2f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -51,7 +53,7 @@ declare @llvm.experimental.constrained.floor.nxv2f16( @floor_nxv4f16( %x) strictfp { ; CHECK-LABEL: floor_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, 
v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -59,6 +61,7 @@ define @floor_nxv4f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -73,7 +76,7 @@ declare @llvm.experimental.constrained.floor.nxv4f16( @floor_nxv8f16( %x) strictfp { ; CHECK-LABEL: floor_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -81,6 +84,7 @@ define @floor_nxv8f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -95,7 +99,7 @@ declare @llvm.experimental.constrained.floor.nxv8f16( @floor_nxv16f16( %x) strictfp { ; CHECK-LABEL: floor_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -103,6 +107,7 @@ define @floor_nxv16f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -117,7 +122,7 @@ declare @llvm.experimental.constrained.floor.nxv16f16( @floor_nxv32f16( %x) strictfp { ; CHECK-LABEL: floor_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; 
CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -125,6 +130,7 @@ define @floor_nxv32f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -139,7 +145,7 @@ declare @llvm.experimental.constrained.floor.nxv32f16( @floor_nxv1f32( %x) strictfp { ; CHECK-LABEL: floor_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -147,6 +153,7 @@ define @floor_nxv1f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -161,7 +168,7 @@ declare @llvm.experimental.constrained.floor.nxv1f32( @floor_nxv2f32( %x) strictfp { ; CHECK-LABEL: floor_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -169,6 +176,7 @@ define @floor_nxv2f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -183,7 +191,7 @@ declare @llvm.experimental.constrained.floor.nxv2f32( @floor_nxv4f32( %x) strictfp { ; CHECK-LABEL: floor_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv 
v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -191,6 +199,7 @@ define @floor_nxv4f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -205,7 +214,7 @@ declare @llvm.experimental.constrained.floor.nxv4f32( @floor_nxv8f32( %x) strictfp { ; CHECK-LABEL: floor_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -213,6 +222,7 @@ define @floor_nxv8f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -227,7 +237,7 @@ declare @llvm.experimental.constrained.floor.nxv8f32( @floor_nxv16f32( %x) strictfp { ; CHECK-LABEL: floor_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -235,6 +245,7 @@ define @floor_nxv16f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -249,7 +260,7 @@ declare @llvm.experimental.constrained.floor.nxv16f32( @floor_nxv1f64( %x) strictfp { ; CHECK-LABEL: floor_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; 
CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -257,6 +268,7 @@ define @floor_nxv1f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -271,7 +283,7 @@ declare @llvm.experimental.constrained.floor.nxv1f64( @floor_nxv2f64( %x) strictfp { ; CHECK-LABEL: floor_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -279,6 +291,7 @@ define @floor_nxv2f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -293,7 +306,7 @@ declare @llvm.experimental.constrained.floor.nxv2f64( @floor_nxv4f64( %x) strictfp { ; CHECK-LABEL: floor_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -301,6 +314,7 @@ define @floor_nxv4f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -315,7 +329,7 @@ declare @llvm.experimental.constrained.floor.nxv4f64( @floor_nxv8f64( %x) strictfp { ; CHECK-LABEL: floor_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; 
CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -323,6 +337,7 @@ define @floor_nxv8f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll index 1e93a73ede5d65..404fb72b8abe91 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll @@ -7,7 +7,7 @@ define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: ceil_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -15,6 +15,7 @@ define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -29,7 +30,7 @@ declare <1 x half> @llvm.experimental.constrained.ceil.v1f16(<1 x half>, metadat define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: ceil_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -37,6 +38,7 @@ define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, 
v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -51,7 +53,7 @@ declare <2 x half> @llvm.experimental.constrained.ceil.v2f16(<2 x half>, metadat define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: ceil_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -59,6 +61,7 @@ define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -73,7 +76,7 @@ declare <4 x half> @llvm.experimental.constrained.ceil.v4f16(<4 x half>, metadat define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: ceil_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -81,6 +84,7 @@ define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -95,7 +99,7 @@ declare <8 x half> @llvm.experimental.constrained.ceil.v8f16(<8 x half>, metadat define <16 x half> @ceil_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: ceil_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -103,6 +107,7 @@ define <16 x half> @ceil_v16f16(<16 x half> %x) 
strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -118,7 +123,7 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: ceil_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -126,6 +131,7 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -140,7 +146,7 @@ declare <32 x half> @llvm.experimental.constrained.ceil.v32f16(<32 x half>, meta define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: ceil_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -148,6 +154,7 @@ define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -162,7 +169,7 @@ declare <1 x float> @llvm.experimental.constrained.ceil.v1f32(<1 x float>, metad define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp { ; CHECK-LABEL: ceil_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, 
v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -170,6 +177,7 @@ define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -184,7 +192,7 @@ declare <2 x float> @llvm.experimental.constrained.ceil.v2f32(<2 x float>, metad define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: ceil_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -192,6 +200,7 @@ define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -206,7 +215,7 @@ declare <4 x float> @llvm.experimental.constrained.ceil.v4f32(<4 x float>, metad define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: ceil_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -214,6 +223,7 @@ define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -228,7 +238,7 @@ declare <8 x float> @llvm.experimental.constrained.ceil.v8f32(<8 x float>, metad define <16 x float> @ceil_v16f32(<16 x float> 
%x) strictfp { ; CHECK-LABEL: ceil_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -236,6 +246,7 @@ define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -250,7 +261,7 @@ declare <16 x float> @llvm.experimental.constrained.ceil.v16f32(<16 x float>, me define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp { ; CHECK-LABEL: ceil_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -258,6 +269,7 @@ define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -272,7 +284,7 @@ declare <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double>, met define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: ceil_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -280,6 +292,7 @@ define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, 
v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -294,7 +307,7 @@ declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, met define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: ceil_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -302,6 +315,7 @@ define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -316,7 +330,7 @@ declare <4 x double> @llvm.experimental.constrained.ceil.v4f64(<4 x double>, met define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: ceil_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -324,6 +338,7 @@ define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll index 53018939fc6eb4..2319aab370d2de 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll @@ -7,7 +7,7 @@ define <1 x half> @floor_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: 
floor_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -15,6 +15,7 @@ define <1 x half> @floor_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -29,7 +30,7 @@ declare <1 x half> @llvm.experimental.constrained.floor.v1f16(<1 x half>, metada define <2 x half> @floor_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: floor_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -37,6 +38,7 @@ define <2 x half> @floor_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -51,7 +53,7 @@ declare <2 x half> @llvm.experimental.constrained.floor.v2f16(<2 x half>, metada define <4 x half> @floor_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: floor_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -59,6 +61,7 @@ define <4 x half> @floor_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: 
vfcvt.f.x.v v9, v9, v0.t @@ -73,7 +76,7 @@ declare <4 x half> @llvm.experimental.constrained.floor.v4f16(<4 x half>, metada define <8 x half> @floor_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: floor_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -81,6 +84,7 @@ define <8 x half> @floor_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -95,7 +99,7 @@ declare <8 x half> @llvm.experimental.constrained.floor.v8f16(<8 x half>, metada define <16 x half> @floor_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: floor_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -103,6 +107,7 @@ define <16 x half> @floor_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -118,7 +123,7 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: floor_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -126,6 +131,7 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, 
fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -140,7 +146,7 @@ declare <32 x half> @llvm.experimental.constrained.floor.v32f16(<32 x half>, met define <1 x float> @floor_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: floor_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -148,6 +154,7 @@ define <1 x float> @floor_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -162,7 +169,7 @@ declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, meta define <2 x float> @floor_v2f32(<2 x float> %x) strictfp { ; CHECK-LABEL: floor_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -170,6 +177,7 @@ define <2 x float> @floor_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -184,7 +192,7 @@ declare <2 x float> @llvm.experimental.constrained.floor.v2f32(<2 x float>, meta define <4 x float> @floor_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: floor_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: 
vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -192,6 +200,7 @@ define <4 x float> @floor_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -206,7 +215,7 @@ declare <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float>, meta define <8 x float> @floor_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: floor_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -214,6 +223,7 @@ define <8 x float> @floor_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -228,7 +238,7 @@ declare <8 x float> @llvm.experimental.constrained.floor.v8f32(<8 x float>, meta define <16 x float> @floor_v16f32(<16 x float> %x) strictfp { ; CHECK-LABEL: floor_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -236,6 +246,7 @@ define <16 x float> @floor_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -250,7 +261,7 @@ declare <16 x float> @llvm.experimental.constrained.floor.v16f32(<16 x float>, m define <1 x double> @floor_v1f64(<1 x double> 
%x) strictfp { ; CHECK-LABEL: floor_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -258,6 +269,7 @@ define <1 x double> @floor_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -272,7 +284,7 @@ declare <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double>, me define <2 x double> @floor_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: floor_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -280,6 +292,7 @@ define <2 x double> @floor_v2f64(<2 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -294,7 +307,7 @@ declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, me define <4 x double> @floor_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: floor_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -302,6 +315,7 @@ define <4 x double> @floor_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: 
vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -316,7 +330,7 @@ declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, me define <8 x double> @floor_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: floor_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -324,6 +338,7 @@ define <8 x double> @floor_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll index 9e9a8b8a4b644e..719dd524942846 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.experimental.constrained.nearbyint.v2f16(<2 x half>, me define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -32,7 +33,7 @@ declare 
<4 x half> @llvm.experimental.constrained.nearbyint.v4f16(<4 x half>, me define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -40,6 +41,7 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -55,7 +57,7 @@ declare <8 x half> @llvm.experimental.constrained.nearbyint.v8f16(<8 x half>, me define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -63,6 +65,7 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -78,7 +81,7 @@ declare <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(<16 x half>, define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -86,6 +89,7 @@ define <16 x half> @nearbyint_v16f16(<16 x 
half> %v) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -102,7 +106,7 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -110,6 +114,7 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -125,7 +130,7 @@ declare <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float>, define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp { ; CHECK-LABEL: nearbyint_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -133,6 +138,7 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -148,7 +154,7 @@ declare <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float>, define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp { ; CHECK-LABEL: nearbyint_v4f32: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -156,6 +162,7 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu @@ -171,7 +178,7 @@ declare <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float>, define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp { ; CHECK-LABEL: nearbyint_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -179,6 +186,7 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu @@ -194,7 +202,7 @@ declare <16 x float> @llvm.experimental.constrained.nearbyint.v16f32(<16 x float define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp { ; CHECK-LABEL: nearbyint_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -202,6 +210,7 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: 
vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu @@ -217,7 +226,7 @@ declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp { ; CHECK-LABEL: nearbyint_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0) @@ -225,6 +234,7 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -240,7 +250,7 @@ declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp { ; CHECK-LABEL: nearbyint_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI10_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI10_0)(a0) @@ -248,6 +258,7 @@ define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -263,7 +274,7 @@ declare <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double> define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp { ; CHECK-LABEL: nearbyint_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; 
CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -271,6 +282,7 @@ define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll index f189354237ee3a..e855d9504ff404 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll @@ -9,7 +9,7 @@ define <1 x half> @round_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: round_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define <1 x half> @round_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,7 +32,7 @@ declare <1 x half> @llvm.experimental.constrained.round.v1f16(<1 x half>, metada define <2 x half> @round_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: round_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -39,6 +40,7 @@ define <2 x half> @round_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; 
CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -53,7 +55,7 @@ declare <2 x half> @llvm.experimental.constrained.round.v2f16(<2 x half>, metada define <4 x half> @round_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: round_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -61,6 +63,7 @@ define <4 x half> @round_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -75,7 +78,7 @@ declare <4 x half> @llvm.experimental.constrained.round.v4f16(<4 x half>, metada define <8 x half> @round_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: round_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -83,6 +86,7 @@ define <8 x half> @round_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -97,7 +101,7 @@ declare <8 x half> @llvm.experimental.constrained.round.v8f16(<8 x half>, metada define <16 x half> @round_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: round_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv 
v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -105,6 +109,7 @@ define <16 x half> @round_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -120,7 +125,7 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: round_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -128,6 +133,7 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -142,7 +148,7 @@ declare <32 x half> @llvm.experimental.constrained.round.v32f16(<32 x half>, met define <1 x float> @round_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: round_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -150,6 +156,7 @@ define <1 x float> @round_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -164,7 +171,7 @@ declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, meta define <2 x float> @round_v2f32(<2 x float> %x) strictfp { ; 
CHECK-LABEL: round_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -172,6 +179,7 @@ define <2 x float> @round_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -186,7 +194,7 @@ declare <2 x float> @llvm.experimental.constrained.round.v2f32(<2 x float>, meta define <4 x float> @round_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: round_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -194,6 +202,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -208,7 +217,7 @@ declare <4 x float> @llvm.experimental.constrained.round.v4f32(<4 x float>, meta define <8 x float> @round_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: round_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -216,6 +225,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; 
CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -230,7 +240,7 @@ declare <8 x float> @llvm.experimental.constrained.round.v8f32(<8 x float>, meta define <16 x float> @round_v16f32(<16 x float> %x) strictfp { ; CHECK-LABEL: round_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -238,6 +248,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -252,7 +263,7 @@ declare <16 x float> @llvm.experimental.constrained.round.v16f32(<16 x float>, m define <1 x double> @round_v1f64(<1 x double> %x) strictfp { ; CHECK-LABEL: round_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -260,6 +271,7 @@ define <1 x double> @round_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,7 +286,7 @@ declare <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double>, me define <2 x double> @round_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: round_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -282,6 +294,7 @@ define <2 x double> @round_v2f64(<2 x 
double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -296,7 +309,7 @@ declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, me define <4 x double> @round_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: round_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -304,6 +317,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -318,7 +332,7 @@ declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, me define <8 x double> @round_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: round_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -326,6 +340,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll index 11920c7c31c981..9976cd2a8ab29a 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll @@ -9,7 +9,7 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,7 +32,7 @@ declare <1 x half> @llvm.experimental.constrained.roundeven.v1f16(<1 x half>, me define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -39,6 +40,7 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -53,7 +55,7 @@ declare <2 x half> @llvm.experimental.constrained.roundeven.v2f16(<2 x half>, me define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -61,6 +63,7 @@ 
define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -75,7 +78,7 @@ declare <4 x half> @llvm.experimental.constrained.roundeven.v4f16(<4 x half>, me define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -83,6 +86,7 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -97,7 +101,7 @@ declare <8 x half> @llvm.experimental.constrained.roundeven.v8f16(<8 x half>, me define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -105,6 +109,7 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -120,7 +125,7 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, 
ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -128,6 +133,7 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -142,7 +148,7 @@ declare <32 x half> @llvm.experimental.constrained.roundeven.v32f16(<32 x half>, define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -150,6 +156,7 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -164,7 +171,7 @@ declare <1 x float> @llvm.experimental.constrained.roundeven.v1f32(<1 x float>, define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -172,6 +179,7 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -186,7 +194,7 
@@ declare <2 x float> @llvm.experimental.constrained.roundeven.v2f32(<2 x float>, define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -194,6 +202,7 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -208,7 +217,7 @@ declare <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float>, define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -216,6 +225,7 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -230,7 +240,7 @@ declare <8 x float> @llvm.experimental.constrained.roundeven.v8f32(<8 x float>, define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp { ; CHECK-LABEL: roundeven_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -238,6 +248,7 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, 
a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -252,7 +263,7 @@ declare <16 x float> @llvm.experimental.constrained.roundeven.v16f32(<16 x float define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp { ; CHECK-LABEL: roundeven_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -260,6 +271,7 @@ define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,7 +286,7 @@ declare <1 x double> @llvm.experimental.constrained.roundeven.v1f64(<1 x double> define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: roundeven_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -282,6 +294,7 @@ define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -296,7 +309,7 @@ declare <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: roundeven_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; 
CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -304,6 +317,7 @@ define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -318,7 +332,7 @@ declare <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double> define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: roundeven_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -326,6 +340,7 @@ define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll index f16581444afca5..eac26451d5a8cc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll @@ -7,13 +7,14 @@ define <1 x half> @trunc_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: trunc_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v 
v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -27,13 +28,14 @@ declare <1 x half> @llvm.experimental.constrained.trunc.v1f16(<1 x half>, metada define <2 x half> @trunc_v2f16(<2 x half> %x) strictfp { ; CHECK-LABEL: trunc_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -47,13 +49,14 @@ declare <2 x half> @llvm.experimental.constrained.trunc.v2f16(<2 x half>, metada define <4 x half> @trunc_v4f16(<4 x half> %x) strictfp { ; CHECK-LABEL: trunc_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -67,13 +70,14 @@ declare <4 x half> @llvm.experimental.constrained.trunc.v4f16(<4 x half>, metada define <8 x half> @trunc_v8f16(<8 x half> %x) strictfp { ; CHECK-LABEL: trunc_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, 
%hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -87,13 +91,14 @@ declare <8 x half> @llvm.experimental.constrained.trunc.v8f16(<8 x half>, metada define <16 x half> @trunc_v16f16(<16 x half> %x) strictfp { ; CHECK-LABEL: trunc_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -108,13 +113,14 @@ define <32 x half> @trunc_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: trunc_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -128,13 +134,14 @@ declare <32 x half> @llvm.experimental.constrained.trunc.v32f16(<32 x half>, met define <1 x float> @trunc_v1f32(<1 x float> %x) strictfp { ; CHECK-LABEL: trunc_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, 
e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -148,13 +155,14 @@ declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, meta define <2 x float> @trunc_v2f32(<2 x float> %x) strictfp { ; CHECK-LABEL: trunc_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -168,13 +176,14 @@ declare <2 x float> @llvm.experimental.constrained.trunc.v2f32(<2 x float>, meta define <4 x float> @trunc_v4f32(<4 x float> %x) strictfp { ; CHECK-LABEL: trunc_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu @@ -188,13 +197,14 @@ declare <4 x float> @llvm.experimental.constrained.trunc.v4f32(<4 x float>, meta define <8 x float> @trunc_v8f32(<8 x float> %x) strictfp { ; CHECK-LABEL: trunc_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu @@ -208,13 +218,14 @@ declare <8 x float> @llvm.experimental.constrained.trunc.v8f32(<8 x float>, meta define <16 x float> @trunc_v16f32(<16 x float> %x) strictfp { ; CHECK-LABEL: trunc_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu @@ -228,13 +239,14 @@ declare <16 x float> @llvm.experimental.constrained.trunc.v16f32(<16 x float>, m define <1 x double> @trunc_v1f64(<1 x double> %x) strictfp { ; CHECK-LABEL: trunc_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -248,13 +260,14 @@ declare <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double>, me define <2 x double> 
@trunc_v2f64(<2 x double> %x) strictfp { ; CHECK-LABEL: trunc_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -268,13 +281,14 @@ declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, me define <4 x double> @trunc_v4f64(<4 x double> %x) strictfp { ; CHECK-LABEL: trunc_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -288,13 +302,14 @@ declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, me define <8 x double> @trunc_v8f64(<8 x double> %x) strictfp { ; CHECK-LABEL: trunc_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, 
m4, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll index f88a9b3081a1a8..372937bb5ca5df 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll @@ -9,7 +9,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv1f16( @nearbyint_nxv1f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define @nearbyint_nxv1f16( %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -32,7 +33,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv2f16( @nearbyint_nxv2f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -40,6 +41,7 @@ define @nearbyint_nxv2f16( %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -55,7 +57,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv4f16( @nearbyint_nxv4f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; 
CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -63,6 +65,7 @@ define @nearbyint_nxv4f16( %v) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -78,7 +81,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv8f16( @nearbyint_nxv8f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -86,6 +89,7 @@ define @nearbyint_nxv8f16( %v) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -101,7 +105,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv16f16(< define @nearbyint_nxv16f16( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -109,6 +113,7 @@ define @nearbyint_nxv16f16( %v) strictf ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -124,7 +129,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv32f16(< define @nearbyint_nxv32f16( %v) strictfp { ; CHECK-LABEL: 
nearbyint_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -132,6 +137,7 @@ define @nearbyint_nxv32f16( %v) strictf ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -147,7 +153,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv1f32( @nearbyint_nxv1f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -155,6 +161,7 @@ define @nearbyint_nxv1f32( %v) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -170,7 +177,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv2f32( @nearbyint_nxv2f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -178,6 +185,7 @@ define @nearbyint_nxv2f32( %v) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu @@ -193,7 +201,7 @@ 
declare @llvm.experimental.constrained.nearbyint.nxv4f32( @nearbyint_nxv4f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -201,6 +209,7 @@ define @nearbyint_nxv4f32( %v) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu @@ -216,7 +225,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv8f32( @nearbyint_nxv8f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -224,6 +233,7 @@ define @nearbyint_nxv8f32( %v) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu @@ -239,7 +249,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv16f32( define @nearbyint_nxv16f32( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -247,6 +257,7 @@ define @nearbyint_nxv16f32( %v) stric ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; 
CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu @@ -262,7 +273,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv1f64(< define @nearbyint_nxv1f64( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -270,6 +281,7 @@ define @nearbyint_nxv1f64( %v) strict ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -285,7 +297,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv2f64(< define @nearbyint_nxv2f64( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -293,6 +305,7 @@ define @nearbyint_nxv2f64( %v) strict ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -308,7 +321,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv4f64(< define @nearbyint_nxv4f64( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -316,6 +329,7 @@ define @nearbyint_nxv4f64( %v) strict ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: 
vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -331,7 +345,7 @@ declare @llvm.experimental.constrained.nearbyint.nxv8f64(< define @nearbyint_nxv8f64( %v) strictfp { ; CHECK-LABEL: nearbyint_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -339,6 +353,7 @@ define @nearbyint_nxv8f64( %v) strict ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll index 3276f481f30ea5..aaa7a538e70fb7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll @@ -9,7 +9,7 @@ define @round_nxv1f16( %x) strictfp { ; CHECK-LABEL: round_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define @round_nxv1f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,7 +32,7 @@ declare @llvm.experimental.constrained.round.nxv1f16( @round_nxv2f16( %x) strictfp { ; CHECK-LABEL: round_nxv2f16: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -39,6 +40,7 @@ define @round_nxv2f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -53,7 +55,7 @@ declare @llvm.experimental.constrained.round.nxv2f16( @round_nxv4f16( %x) strictfp { ; CHECK-LABEL: round_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -61,6 +63,7 @@ define @round_nxv4f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -75,7 +78,7 @@ declare @llvm.experimental.constrained.round.nxv4f16( @round_nxv8f16( %x) strictfp { ; CHECK-LABEL: round_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -83,6 +86,7 @@ define @round_nxv8f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -97,7 +101,7 @@ declare @llvm.experimental.constrained.round.nxv8f16( @round_nxv16f16( %x) strictfp { ; CHECK-LABEL: round_nxv16f16: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -105,6 +109,7 @@ define @round_nxv16f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -119,7 +124,7 @@ declare @llvm.experimental.constrained.round.nxv16f16( @round_nxv32f16( %x) strictfp { ; CHECK-LABEL: round_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -127,6 +132,7 @@ define @round_nxv32f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -141,7 +147,7 @@ declare @llvm.experimental.constrained.round.nxv32f16( @round_nxv1f32( %x) strictfp { ; CHECK-LABEL: round_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -149,6 +155,7 @@ define @round_nxv1f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -163,7 +170,7 @@ declare @llvm.experimental.constrained.round.nxv1f32( @round_nxv2f32( %x) strictfp { ; CHECK-LABEL: round_nxv2f32: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -171,6 +178,7 @@ define @round_nxv2f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -185,7 +193,7 @@ declare @llvm.experimental.constrained.round.nxv2f32( @round_nxv4f32( %x) strictfp { ; CHECK-LABEL: round_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -193,6 +201,7 @@ define @round_nxv4f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -207,7 +216,7 @@ declare @llvm.experimental.constrained.round.nxv4f32( @round_nxv8f32( %x) strictfp { ; CHECK-LABEL: round_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -215,6 +224,7 @@ define @round_nxv8f32( %x) strictfp { ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -229,7 +239,7 @@ declare @llvm.experimental.constrained.round.nxv8f32( @round_nxv16f32( %x) strictfp { ; CHECK-LABEL: round_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, 
zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -237,6 +247,7 @@ define @round_nxv16f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -251,7 +262,7 @@ declare @llvm.experimental.constrained.round.nxv16f32( @round_nxv1f64( %x) strictfp { ; CHECK-LABEL: round_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -259,6 +270,7 @@ define @round_nxv1f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -273,7 +285,7 @@ declare @llvm.experimental.constrained.round.nxv1f64( @round_nxv2f64( %x) strictfp { ; CHECK-LABEL: round_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -281,6 +293,7 @@ define @round_nxv2f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -295,7 +308,7 @@ declare @llvm.experimental.constrained.round.nxv2f64( @round_nxv4f64( %x) strictfp { ; CHECK-LABEL: round_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, 
e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -303,6 +316,7 @@ define @round_nxv4f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -317,7 +331,7 @@ declare @llvm.experimental.constrained.round.nxv4f64( @round_nxv8f64( %x) strictfp { ; CHECK-LABEL: round_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -325,6 +339,7 @@ define @round_nxv8f64( %x) strictfp { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll index 4ebfcccbaaa6e6..cdc01d658778bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll @@ -9,7 +9,7 @@ define @roundeven_nxv1f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) @@ -17,6 +17,7 @@ define @roundeven_nxv1f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; 
CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,7 +32,7 @@ declare @llvm.experimental.constrained.roundeven.nxv1f16( @roundeven_nxv2f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) @@ -39,6 +40,7 @@ define @roundeven_nxv2f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -53,7 +55,7 @@ declare @llvm.experimental.constrained.roundeven.nxv2f16( @roundeven_nxv4f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) @@ -61,6 +63,7 @@ define @roundeven_nxv4f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -75,7 +78,7 @@ declare @llvm.experimental.constrained.roundeven.nxv4f16( @roundeven_nxv8f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) @@ -83,6 +86,7 @@ define @roundeven_nxv8f16( %x) strictfp { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, 
zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -97,7 +101,7 @@ declare @llvm.experimental.constrained.roundeven.nxv8f16( @roundeven_nxv16f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) @@ -105,6 +109,7 @@ define @roundeven_nxv16f16( %x) strictf ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -119,7 +124,7 @@ declare @llvm.experimental.constrained.roundeven.nxv16f16(< define @roundeven_nxv32f16( %x) strictfp { ; CHECK-LABEL: roundeven_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) @@ -127,6 +132,7 @@ define @roundeven_nxv32f16( %x) strictf ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -141,7 +147,7 @@ declare @llvm.experimental.constrained.roundeven.nxv32f16(< define @roundeven_nxv1f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -149,6 +155,7 @@ define @roundeven_nxv1f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 
; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -163,7 +170,7 @@ declare @llvm.experimental.constrained.roundeven.nxv1f32( @roundeven_nxv2f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 @@ -171,6 +178,7 @@ define @roundeven_nxv2f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -185,7 +193,7 @@ declare @llvm.experimental.constrained.roundeven.nxv2f32( @roundeven_nxv4f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 @@ -193,6 +201,7 @@ define @roundeven_nxv4f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -207,7 +216,7 @@ declare @llvm.experimental.constrained.roundeven.nxv4f32( @roundeven_nxv8f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 @@ -215,6 +224,7 @@ define @roundeven_nxv8f32( %x) strictfp ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf 
v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -229,7 +239,7 @@ declare @llvm.experimental.constrained.roundeven.nxv8f32( @roundeven_nxv16f32( %x) strictfp { ; CHECK-LABEL: roundeven_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 @@ -237,6 +247,7 @@ define @roundeven_nxv16f32( %x) stric ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -251,7 +262,7 @@ declare @llvm.experimental.constrained.roundeven.nxv16f32( define @roundeven_nxv1f64( %x) strictfp { ; CHECK-LABEL: roundeven_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) @@ -259,6 +270,7 @@ define @roundeven_nxv1f64( %x) strict ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -273,7 +285,7 @@ declare @llvm.experimental.constrained.roundeven.nxv1f64(< define @roundeven_nxv2f64( %x) strictfp { ; CHECK-LABEL: roundeven_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) @@ -281,6 +293,7 @@ define @roundeven_nxv2f64( %x) strict ; 
CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -295,7 +308,7 @@ declare @llvm.experimental.constrained.roundeven.nxv2f64(< define @roundeven_nxv4f64( %x) strictfp { ; CHECK-LABEL: roundeven_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) @@ -303,6 +316,7 @@ define @roundeven_nxv4f64( %x) strict ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -317,7 +331,7 @@ declare @llvm.experimental.constrained.roundeven.nxv4f64(< define @roundeven_nxv8f64( %x) strictfp { ; CHECK-LABEL: roundeven_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) @@ -325,6 +339,7 @@ define @roundeven_nxv8f64( %x) strict ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll index 3665669d83a3d4..21615b516da898 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll @@ -7,13 +7,14 @@ define @trunc_nxv1f16( %x) strictfp { ; CHECK-LABEL: 
trunc_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -27,13 +28,14 @@ declare @llvm.experimental.constrained.trunc.nxv1f16( @trunc_nxv2f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -47,13 +49,14 @@ declare @llvm.experimental.constrained.trunc.nxv2f16( @trunc_nxv4f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -67,13 +70,14 @@ declare @llvm.experimental.constrained.trunc.nxv4f16( @trunc_nxv8f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv8f16: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -87,13 +91,14 @@ declare @llvm.experimental.constrained.trunc.nxv8f16( @trunc_nxv16f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -107,13 +112,14 @@ declare @llvm.experimental.constrained.trunc.nxv16f16( @trunc_nxv32f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -127,13 +133,14 @@ declare @llvm.experimental.constrained.trunc.nxv32f16( @trunc_nxv1f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, 
zero, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -147,13 +154,14 @@ declare @llvm.experimental.constrained.trunc.nxv1f32( @trunc_nxv2f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu @@ -167,13 +175,14 @@ declare @llvm.experimental.constrained.trunc.nxv2f32( @trunc_nxv4f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu @@ -187,13 +196,14 @@ declare @llvm.experimental.constrained.trunc.nxv4f32( @trunc_nxv8f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: 
vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu @@ -207,13 +217,14 @@ declare @llvm.experimental.constrained.trunc.nxv8f32( @trunc_nxv16f32( %x) strictfp { ; CHECK-LABEL: trunc_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu @@ -227,13 +238,14 @@ declare @llvm.experimental.constrained.trunc.nxv16f32( @trunc_nxv1f64( %x) strictfp { ; CHECK-LABEL: trunc_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -247,13 +259,14 @@ declare @llvm.experimental.constrained.trunc.nxv1f64( @trunc_nxv2f64( %x) strictfp { ; CHECK-LABEL: trunc_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; 
CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -267,13 +280,14 @@ declare @llvm.experimental.constrained.trunc.nxv2f64( @trunc_nxv4f64( %x) strictfp { ; CHECK-LABEL: trunc_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -287,13 +301,14 @@ declare @llvm.experimental.constrained.trunc.nxv4f64( @trunc_nxv8f64( %x) strictfp { ; CHECK-LABEL: trunc_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu From f284af4863640e6b68918aa23b14498c1b8e2245 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 28 May 2024 10:50:28 -0500 Subject: [PATCH 25/89] [Offload][Fix] Fix lazy initialization with multiple images Summary: There was a bug here where we would initialize the plugin multiple times when there were multiple images. 
Fix it by putting the `is_initliaized` check later. --- offload/src/PluginManager.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp index f72007849e36e4..13f08b142b8769 100644 --- a/offload/src/PluginManager.cpp +++ b/offload/src/PluginManager.cpp @@ -155,11 +155,11 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) { // Initialize all the plugins that have associated images. for (auto &Plugin : Plugins) { - if (Plugin->is_initialized()) - continue; - // Extract the exectuable image and extra information if availible. for (int32_t i = 0; i < Desc->NumDeviceImages; ++i) { + if (Plugin->is_initialized()) + continue; + if (!Plugin->is_valid_binary(&Desc->DeviceImages[i], /*Initialized=*/false)) continue; From af22e274e9c5643780f25066442e05b5bd453328 Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Tue, 28 May 2024 17:59:23 +0200 Subject: [PATCH 26/89] TosaToTensor: Support reshape on tensors of unsigned integer (#91734) This adds - `mlir::tosa::populateTosaToLinalgTypeConversion` which converts tensors of unsigned integers into tensors of signless integers - modifies the `tosa.reshape` lowering in TosaToTensor to use the type converter correctly I choose to implement the type converter in `mlir/Conversion/TosaToLinalg/TosaToLinalg.h` instead of `mlir/Conversion/TosaToTensor/TosaToTensor.h` because I need the same type converter in the TosaToLinalg lowerings (future PR). Alternatively, I could duplicate the type converter so it exists both in TosaToLinalg and TosaToTensor. Let me know if you prefer that. 
--- .../Conversion/TosaToTensor/TosaToTensor.h | 4 +- .../mlir/Dialect/Tosa/Transforms/Passes.h | 3 ++ .../Conversion/TosaToTensor/TosaToTensor.cpp | 33 +++++++----- .../TosaToTensor/TosaToTensorPass.cpp | 5 +- .../Dialect/Tosa/Transforms/CMakeLists.txt | 1 + .../Tosa/Transforms/TosaTypeConverters.cpp | 52 +++++++++++++++++++ .../TosaToTensor/tosa-to-tensor.mlir | 14 +++++ 7 files changed, 97 insertions(+), 15 deletions(-) create mode 100644 mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp diff --git a/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h b/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h index 3953c83f3aa106..76a4b1b1563366 100644 --- a/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h +++ b/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h @@ -16,6 +16,7 @@ #include "mlir/Pass/Pass.h" namespace mlir { +class TypeConverter; #define GEN_PASS_DECL_TOSATOTENSOR #include "mlir/Conversion/Passes.h.inc" @@ -24,7 +25,8 @@ namespace tosa { std::unique_ptr createTosaToTensor(); -void populateTosaToTensorConversionPatterns(RewritePatternSet *patterns); +void populateTosaToTensorConversionPatterns(TypeConverter &converter, + RewritePatternSet *patterns); } // namespace tosa } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h index fbfc56dfe2cf4f..1f9522b51a4cf5 100644 --- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h @@ -18,6 +18,7 @@ #include "mlir/Pass/Pass.h" namespace mlir { +class TypeConverter; namespace tosa { #define GEN_PASS_DECL @@ -38,6 +39,8 @@ void populateTosaConstantReduction(MLIRContext *ctx, RewritePatternSet &patterns, bool aggressiveReduceConstant); +void populateTosaTypeConversion(TypeConverter &converter); + std::unique_ptr createTosaLayerwiseConstantFoldPass(); std::unique_ptr createTosaLayerwiseConstantFoldPass( const TosaLayerwiseConstantFoldPassOptions 
&options); diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp index 89f956a5e70175..c0c015ab34aab0 100644 --- a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp +++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp @@ -224,8 +224,17 @@ class ReshapeConverter : public OpConversionPattern { matchAndRewrite(tosa::ReshapeOp reshape, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const final { auto loc = reshape.getLoc(); - auto resultType = reshape.getResult().getType(); - auto input = reshape.getInput1(); + auto resultType = cast_if_present( + getTypeConverter()->convertType(reshape.getType())); + if (!resultType) { + return rewriter.notifyMatchFailure(reshape.getLoc(), + "could not convert result type"); + } + auto input = dyn_cast>(adaptor.getInput1()); + if (!input) { + return rewriter.notifyMatchFailure(reshape.getLoc(), + "expected input type to be tensor"); + } auto newShape = reshape.getNewShape(); // Infer all intermediate types @@ -288,12 +297,13 @@ class SliceConverter : public OpConversionPattern { } }; -class PadConverter : public OpRewritePattern { +class PadConverter : public OpConversionPattern { public: - using OpRewritePattern::OpRewritePattern; + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(tosa::PadOp padOp, - PatternRewriter &rewriter) const final { + LogicalResult + matchAndRewrite(tosa::PadOp padOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { auto loc = padOp.getLoc(); auto input = padOp.getInput1(); auto padding = padOp.getPadding(); @@ -428,11 +438,8 @@ struct ConcatConverter : public OpConversionPattern { } // namespace void mlir::tosa::populateTosaToTensorConversionPatterns( - RewritePatternSet *patterns) { - patterns->add< - ConcatConverter, - PadConverter, - ReshapeConverter, - SliceConverter - >(patterns->getContext()); + TypeConverter &converter, RewritePatternSet *patterns) { + patterns + ->add( + 
converter, patterns->getContext()); } diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp index 50dc55667fb94e..fa1c2cf7fba986 100644 --- a/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp +++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp @@ -42,7 +42,10 @@ struct TosaToTensor : public impl::TosaToTensorBase { target.addLegalDialect(); target.addLegalDialect(); - mlir::tosa::populateTosaToTensorConversionPatterns(&patterns); + TypeConverter converter; + mlir::tosa::populateTosaTypeConversion(converter); + + mlir::tosa::populateTosaToTensorConversionPatterns(converter, &patterns); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) diff --git a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt index 0e6510ba1e9255..c78a74b874aff1 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt @@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRTosaTransforms TosaLayerwiseConstantFoldPass.cpp TosaMakeBroadcastable.cpp TosaOptionalDecompositions.cpp + TosaTypeConverters.cpp TosaValidation.cpp ADDITIONAL_HEADER_DIRS diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp new file mode 100644 index 00000000000000..d2650de8cd7f02 --- /dev/null +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp @@ -0,0 +1,52 @@ + +//===- TosaTypeConverters.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Type converters for lowering TOSA to linalg/arith. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Tosa/Transforms/Passes.h" + +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +void mlir::tosa::populateTosaTypeConversion(TypeConverter &converter) { + converter.addConversion([&](Type type) -> std::optional { + if (type.isUnsignedInteger()) { + return IntegerType::get(type.getContext(), type.getIntOrFloatBitWidth(), + IntegerType::SignednessSemantics::Signless); + } + return type; + }); + converter.addConversion([&](TensorType type) -> std::optional { + auto converted = converter.convertType(type.getElementType()); + if (!converted) + return {}; + return type.clone(converted); + }); + converter.addSourceMaterialization([&](OpBuilder &builder, Type resultType, + ValueRange inputs, + Location loc) -> std::optional { + if (inputs.size() != 1) + return std::nullopt; + + return builder.create(loc, resultType, inputs) + .getResult(0); + }); + converter.addTargetMaterialization([&](OpBuilder &builder, Type resultType, + ValueRange inputs, + Location loc) -> std::optional { + if (inputs.size() != 1) + return std::nullopt; + + return builder.create(loc, resultType, inputs) + .getResult(0); + }); +} diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir index 72e7e4cc840886..1e62e25176a007 100644 --- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir +++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir @@ -420,6 +420,20 @@ func.func @test_reshape_6d_down_s2s_explicit(%arg0: tensor<1x2x3x5x7x11xf32>) -> // ----- +// CHECK-LABEL: @test_reshape_samerank_unsigned +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x2xui8>) +func.func @test_reshape_samerank_unsigned(%arg0: tensor<3x2xui8>) -> tensor<2x3xui8> { + // CHECK-NEXT: %[[CAST1:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : tensor<3x2xui8> to tensor<3x2xi8> + // CHECK-NEXT: %[[RESHAPE1:.*]] = 
tensor.collapse_shape %[[CAST1]] {{\[}}[0, 1]] : tensor<3x2xi8> into tensor<6xi8> + // CHECK-NEXT: %[[RESHAPE2:.*]] = tensor.expand_shape %[[RESHAPE1]] {{\[}}[0, 1]] output_shape {{\[}}2, 3] : tensor<6xi8> into tensor<2x3xi8> + // CHECK-NEXT: %[[CAST2:.*]] = builtin.unrealized_conversion_cast %[[RESHAPE2]] : tensor<2x3xi8> to tensor<2x3xui8 + %0 = "tosa.reshape"(%arg0) {new_shape = array} : (tensor<3x2xui8>) -> tensor<2x3xui8> + // CHECK-NEXT: return %[[CAST2]] + return %0 : tensor<2x3xui8> +} + +// ----- + // CHECK-LABEL: func @slice func.func @slice(%arg0: tensor<6xf32>) ->() { // CHECK: [[SLICE:%.+]] = tensor.extract_slice %arg0[2] [1] [1] From fea7399e97b73a3209fcbe3338d412069769a637 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 09:03:06 -0700 Subject: [PATCH 27/89] [clang] Fix a warning This patch fixes: clang/unittests/Interpreter/IncrementalProcessingTest.cpp:39:13: error: unused function 'HostSupportsJit' [-Werror,-Wunused-function] --- clang/unittests/Interpreter/IncrementalProcessingTest.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp index 54159173d91e39..f3b091b0c0e6cb 100644 --- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp +++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp @@ -36,14 +36,6 @@ using namespace clang; namespace { -static bool HostSupportsJit() { - auto J = llvm::orc::LLJITBuilder().create(); - if (J) - return true; - LLVMConsumeError(llvm::wrap(J.takeError())); - return false; -} - // Incremental processing produces several modules, all using the same "main // file". Make sure CodeGen can cope with that, e.g. for static initializers. 
const char TestProgram1[] = "extern \"C\" int funcForProg1() { return 17; }\n" From 273777ead296c9ab2c157d16b750e3ee1ace08ec Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 28 May 2024 12:04:44 -0400 Subject: [PATCH 28/89] clang:: to llvm::; NFC These interfaces are LLVM interfaces, not Clang ones; but this worked because of LLVM.h adding the interfaces to the clang namespace. --- clang/lib/AST/APValue.cpp | 2 +- clang/lib/Analysis/MacroExpansionContext.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/APValue.cpp b/clang/lib/AST/APValue.cpp index 8c77b563657d90..d8e33ff421c06c 100644 --- a/clang/lib/AST/APValue.cpp +++ b/clang/lib/AST/APValue.cpp @@ -90,7 +90,7 @@ QualType APValue::LValueBase::getType() const { // For a materialized temporary, the type of the temporary we materialized // may not be the type of the expression. if (const MaterializeTemporaryExpr *MTE = - clang::dyn_cast(Base)) { + llvm::dyn_cast(Base)) { SmallVector CommaLHSs; SmallVector Adjustments; const Expr *Temp = MTE->getSubExpr(); diff --git a/clang/lib/Analysis/MacroExpansionContext.cpp b/clang/lib/Analysis/MacroExpansionContext.cpp index 564e359668a510..b212b7f2457927 100644 --- a/clang/lib/Analysis/MacroExpansionContext.cpp +++ b/clang/lib/Analysis/MacroExpansionContext.cpp @@ -12,7 +12,7 @@ #define DEBUG_TYPE "macro-expansion-context" -static void dumpTokenInto(const clang::Preprocessor &PP, clang::raw_ostream &OS, +static void dumpTokenInto(const clang::Preprocessor &PP, llvm::raw_ostream &OS, clang::Token Tok); namespace clang { From 259caad2f75011174d39615bb0ba31955d16d498 Mon Sep 17 00:00:00 2001 From: Oleksandr T Date: Tue, 28 May 2024 19:08:38 +0300 Subject: [PATCH 29/89] [Clang] Fix an assertion failure when checking invalid `this` (#93490) Skip explicit this check in non-valid scopes due to `null` type in lambdas with invalid captures or incomplete parameter lists during parsing Fixes #91536 --- clang/docs/ReleaseNotes.rst | 1 + 
clang/lib/Sema/SemaExprCXX.cpp | 6 +++--- clang/test/SemaCXX/invalid-this-in-lambda.cpp | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 clang/test/SemaCXX/invalid-this-in-lambda.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6b746cda53c71b..173e61fbf7b2c1 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -803,6 +803,7 @@ Bug Fixes to C++ Support with the same parameters not to be diagnosed. (Fixes #GH93456). - Clang now diagnoses unexpanded parameter packs in attributes. (Fixes #GH93269). - Clang now allows ``@$``` in raw string literals. Fixes (#GH93130). +- Fix an assertion failure when checking invalid ``this`` usage in the wrong context. (Fixes #GH91536). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index d3e9dcb4f4399a..6595abbcdda5b1 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1444,10 +1444,10 @@ bool Sema::CheckCXXThisType(SourceLocation Loc, QualType Type) { // category are defined within such member functions as they are within // an implicit object member function). 
DeclContext *DC = getFunctionLevelDeclContext(); - if (const auto *Method = dyn_cast(DC); - Method && Method->isExplicitObjectMemberFunction()) { + const auto *Method = dyn_cast(DC); + if (Method && Method->isExplicitObjectMemberFunction()) { Diag(Loc, diag::err_invalid_this_use) << 1; - } else if (isLambdaCallWithExplicitObjectParameter(CurContext)) { + } else if (Method && isLambdaCallWithExplicitObjectParameter(CurContext)) { Diag(Loc, diag::err_invalid_this_use) << 1; } else { Diag(Loc, diag::err_invalid_this_use) << 0; diff --git a/clang/test/SemaCXX/invalid-this-in-lambda.cpp b/clang/test/SemaCXX/invalid-this-in-lambda.cpp new file mode 100644 index 00000000000000..ae65bda025e232 --- /dev/null +++ b/clang/test/SemaCXX/invalid-this-in-lambda.cpp @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s + +decltype([]()->decltype(this) { }) a; // expected-error {{invalid use of 'this' outside of a non-static member function}} + From 234cc40adc610a55d1a5a2fe798a9dd07b993f0c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 09:23:02 -0700 Subject: [PATCH 30/89] [LAA] Limit no-overlap check to at least one loop-invariant accesses. Limit the logic added in https://github.com/llvm/llvm-project/pull/9230 to cases where either sink or source are loop-invariant, to avoid compile-time increases. This is not needed for correctness. I am working on follow-up changes to reduce the compile-time impact in general to allow us to enable this again for any source/sink. 
This should fix the compile-time regression introduced by this change: * compile-time improvement with this change: https://llvm-compile-time-tracker.com/compare.php?from=4351787fb650da6d1bfb8d6e58753c90dcd4c418&to=b89010a2eb5f98494787c1c3b77f25208c59090c&stat=instructions:u * compile-time improvement with original patch reverted on top of this change: https://llvm-compile-time-tracker.com/compare.php?from=b89010a2eb5f98494787c1c3b77f25208c59090c&to=19a1103fe68115cfd7d6472c6961f4fabe81a593&stat=instructions:u --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 33 +++++++++++-------- .../LoopAccessAnalysis/depend_diff_types.ll | 10 +++++- .../non-constant-strides-backward.ll | 7 +++- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index bc8b9b8479e4ff..bd4c2a35ebf2cb 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1983,20 +1983,25 @@ getDependenceDistanceStrideAndSize( return MemoryDepChecker::Dependence::IndirectUnsafe; // Check if we can prove that Sink only accesses memory after Src's end or - // vice versa. - const auto &[SrcStart, SrcEnd] = - getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE); - const auto &[SinkStart, SinkEnd] = - getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE); - - if (!isa(SrcStart) && - !isa(SrcEnd) && - !isa(SinkStart) && - !isa(SinkEnd)) { - if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart)) - return MemoryDepChecker::Dependence::NoDep; - if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart)) - return MemoryDepChecker::Dependence::NoDep; + // vice versa. At the moment this is limited to cases where either source or + // sink are loop invariant to avoid compile-time increases. This is not + // required for correctness. 
+ if (SE.isLoopInvariant(Src, InnermostLoop) || + SE.isLoopInvariant(Sink, InnermostLoop)) { + const auto &[SrcStart, SrcEnd] = + getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE); + const auto &[SinkStart, SinkEnd] = + getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE); + + if (!isa(SrcStart) && + !isa(SrcEnd) && + !isa(SinkStart) && + !isa(SinkEnd)) { + if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart)) + return MemoryDepChecker::Dependence::NoDep; + if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart)) + return MemoryDepChecker::Dependence::NoDep; + } } // Need accesses with constant strides and the same direction. We don't want diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll index 809b15b2004952..81d8b01fe7fb72 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll @@ -130,8 +130,16 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) { ; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence' ; CHECK-NEXT: loop: ; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding. +; CHECK-NEXT: Unknown data dependence. 
; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 -> +; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8 +; CHECK-EMPTY: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 -> +; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8 +; CHECK-EMPTY: ; CHECK-NEXT: BackwardVectorizableButPreventsForwarding: ; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 -> ; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll index 845ff078ee0eb4..416742a94e0d36 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll @@ -45,8 +45,13 @@ exit: define void @different_non_constant_strides_known_backward_distance_larger_than_trip_count(ptr %A) { ; CHECK-LABEL: 'different_non_constant_strides_known_backward_distance_larger_than_trip_count' ; CHECK-NEXT: loop: -; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. 
; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %add, ptr %gep.mul.2, align 4 +; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: ; CHECK-EMPTY: From d582958618cc5aeff58c512399bef7b263fedd59 Mon Sep 17 00:00:00 2001 From: AtariDreams Date: Tue, 28 May 2024 12:25:43 -0400 Subject: [PATCH 31/89] Revert "[Legalizer] Check full condition for UMIN and UMAX just like the code below does for SMIN and SMAX" (#93573) Reverts llvm/llvm-project#87932 --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index c04f7208c61f2a..d8b0f52ecf9e32 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3972,7 +3972,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { // target can override this with custom lowering and calling the // implementation functions. LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - if (LI.isLegalOrCustom({G_UMIN, Ty}) && LI.isLegalOrCustom({G_UMAX, Ty})) + if (LI.isLegalOrCustom({G_UMIN, Ty})) return lowerAddSubSatToMinMax(MI); return lowerAddSubSatToAddoSubo(MI); } From 42944e4600827738fae868f0df831fb2678be8b4 Mon Sep 17 00:00:00 2001 From: Miro Bucko Date: Tue, 28 May 2024 23:29:10 +0700 Subject: [PATCH 32/89] Add SBAddressRange and SBAddressRangeList to SB API (#92014) This adds new SB API calls and classes to allow a user of the SB API to obtain address ranges from SBFunction and SBBlock. 
--- lldb/bindings/headers.swig | 2 + .../interface/SBAddressRangeDocstrings.i | 3 + .../interface/SBAddressRangeExtensions.i | 11 + .../interface/SBAddressRangeListDocstrings.i | 3 + .../interface/SBAddressRangeListExtensions.i | 29 ++ lldb/bindings/interfaces.swig | 6 + lldb/include/lldb/API/LLDB.h | 2 + lldb/include/lldb/API/SBAddress.h | 1 + lldb/include/lldb/API/SBAddressRange.h | 66 +++++ lldb/include/lldb/API/SBAddressRangeList.h | 54 ++++ lldb/include/lldb/API/SBBlock.h | 4 + lldb/include/lldb/API/SBDefines.h | 2 + lldb/include/lldb/API/SBFunction.h | 3 + lldb/include/lldb/API/SBStream.h | 2 + lldb/include/lldb/API/SBTarget.h | 1 + lldb/include/lldb/Core/AddressRange.h | 14 + lldb/include/lldb/Core/AddressRangeListImpl.h | 51 ++++ lldb/include/lldb/Symbol/Block.h | 2 + lldb/include/lldb/lldb-forward.h | 3 + lldb/source/API/CMakeLists.txt | 2 + lldb/source/API/SBAddressRange.cpp | 103 +++++++ lldb/source/API/SBAddressRangeList.cpp | 94 +++++++ lldb/source/API/SBBlock.cpp | 10 + lldb/source/API/SBFunction.cpp | 14 + lldb/source/Core/AddressRange.cpp | 43 +++ lldb/source/Core/AddressRangeListImpl.cpp | 50 ++++ lldb/source/Core/CMakeLists.txt | 1 + lldb/source/Symbol/Block.cpp | 16 ++ .../API/python_api/address_range/Makefile | 3 + .../address_range/TestAddressRange.py | 256 ++++++++++++++++++ .../API/python_api/address_range/main.cpp | 8 + 31 files changed, 859 insertions(+) create mode 100644 lldb/bindings/interface/SBAddressRangeDocstrings.i create mode 100644 lldb/bindings/interface/SBAddressRangeExtensions.i create mode 100644 lldb/bindings/interface/SBAddressRangeListDocstrings.i create mode 100644 lldb/bindings/interface/SBAddressRangeListExtensions.i create mode 100644 lldb/include/lldb/API/SBAddressRange.h create mode 100644 lldb/include/lldb/API/SBAddressRangeList.h create mode 100644 lldb/include/lldb/Core/AddressRangeListImpl.h create mode 100644 lldb/source/API/SBAddressRange.cpp create mode 100644 lldb/source/API/SBAddressRangeList.cpp create mode 
100644 lldb/source/Core/AddressRangeListImpl.cpp create mode 100644 lldb/test/API/python_api/address_range/Makefile create mode 100644 lldb/test/API/python_api/address_range/TestAddressRange.py create mode 100644 lldb/test/API/python_api/address_range/main.cpp diff --git a/lldb/bindings/headers.swig b/lldb/bindings/headers.swig index ffdc3c31ec883a..c91504604b6ac6 100644 --- a/lldb/bindings/headers.swig +++ b/lldb/bindings/headers.swig @@ -8,6 +8,8 @@ %{ #include "lldb/lldb-public.h" #include "lldb/API/SBAddress.h" +#include "lldb/API/SBAddressRange.h" +#include "lldb/API/SBAddressRangeList.h" #include "lldb/API/SBAttachInfo.h" #include "lldb/API/SBBlock.h" #include "lldb/API/SBBreakpoint.h" diff --git a/lldb/bindings/interface/SBAddressRangeDocstrings.i b/lldb/bindings/interface/SBAddressRangeDocstrings.i new file mode 100644 index 00000000000000..650195704d73e6 --- /dev/null +++ b/lldb/bindings/interface/SBAddressRangeDocstrings.i @@ -0,0 +1,3 @@ +%feature("docstring", +"API clients can get address range information." +) lldb::SBAddressRange; diff --git a/lldb/bindings/interface/SBAddressRangeExtensions.i b/lldb/bindings/interface/SBAddressRangeExtensions.i new file mode 100644 index 00000000000000..31bcfcb64590bc --- /dev/null +++ b/lldb/bindings/interface/SBAddressRangeExtensions.i @@ -0,0 +1,11 @@ +%extend lldb::SBAddressRange { +#ifdef SWIGPYTHON + %pythoncode%{ + def __repr__(self): + import lldb + stream = lldb.SBStream() + self.GetDescription(stream, lldb.target if lldb.target else lldb.SBTarget()) + return stream.GetData() + %} +#endif +} diff --git a/lldb/bindings/interface/SBAddressRangeListDocstrings.i b/lldb/bindings/interface/SBAddressRangeListDocstrings.i new file mode 100644 index 00000000000000..e4b96b9ca59312 --- /dev/null +++ b/lldb/bindings/interface/SBAddressRangeListDocstrings.i @@ -0,0 +1,3 @@ +%feature("docstring", +"Represents a list of :py:class:`SBAddressRange`." 
+) lldb::SBAddressRangeList; diff --git a/lldb/bindings/interface/SBAddressRangeListExtensions.i b/lldb/bindings/interface/SBAddressRangeListExtensions.i new file mode 100644 index 00000000000000..e281a84d73d27d --- /dev/null +++ b/lldb/bindings/interface/SBAddressRangeListExtensions.i @@ -0,0 +1,29 @@ +%extend lldb::SBAddressRangeList { +#ifdef SWIGPYTHON + %pythoncode%{ + def __len__(self): + '''Return the number of address ranges in a lldb.SBAddressRangeList object.''' + return self.GetSize() + + def __iter__(self): + '''Iterate over all the address ranges in a lldb.SBAddressRangeList object.''' + return lldb_iter(self, 'GetSize', 'GetAddressRangeAtIndex') + + def __getitem__(self, idx): + '''Get the address range at a given index in an lldb.SBAddressRangeList object.''' + if not isinstance(idx, int): + raise TypeError("unsupported index type: %s" % type(idx)) + count = len(self) + if not (-count <= idx < count): + raise IndexError("list index out of range") + idx %= count + return self.GetAddressRangeAtIndex(idx) + + def __repr__(self): + import lldb + stream = lldb.SBStream() + self.GetDescription(stream, lldb.target if lldb.target else lldb.SBTarget()) + return stream.GetData() + %} +#endif +} diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig index 2a29a8dd7ef0b4..0953f4c72a9101 100644 --- a/lldb/bindings/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -12,6 +12,8 @@ /* Docstrings for SB classes and methods */ %include "./interface/SBAddressDocstrings.i" +%include "./interface/SBAddressRangeDocstrings.i" +%include "./interface/SBAddressRangeListDocstrings.i" %include "./interface/SBAttachInfoDocstrings.i" %include "./interface/SBBlockDocstrings.i" %include "./interface/SBBreakpointDocstrings.i" @@ -86,6 +88,8 @@ /* API headers */ %include "lldb/API/SBAddress.h" +%include "lldb/API/SBAddressRange.h" +%include "lldb/API/SBAddressRangeList.h" %include "lldb/API/SBAttachInfo.h" %include "lldb/API/SBBlock.h" %include 
"lldb/API/SBBreakpoint.h" @@ -163,6 +167,8 @@ /* Extensions for SB classes */ %include "./interface/SBAddressExtensions.i" +%include "./interface/SBAddressRangeExtensions.i" +%include "./interface/SBAddressRangeListExtensions.i" %include "./interface/SBBlockExtensions.i" %include "./interface/SBBreakpointExtensions.i" %include "./interface/SBBreakpointListExtensions.i" diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h index b256544326a224..d8cc9f5067fe94 100644 --- a/lldb/include/lldb/API/LLDB.h +++ b/lldb/include/lldb/API/LLDB.h @@ -10,6 +10,8 @@ #define LLDB_API_LLDB_H #include "lldb/API/SBAddress.h" +#include "lldb/API/SBAddressRange.h" +#include "lldb/API/SBAddressRangeList.h" #include "lldb/API/SBAttachInfo.h" #include "lldb/API/SBBlock.h" #include "lldb/API/SBBreakpoint.h" diff --git a/lldb/include/lldb/API/SBAddress.h b/lldb/include/lldb/API/SBAddress.h index 5e5f355ccc390c..430dad4862dbff 100644 --- a/lldb/include/lldb/API/SBAddress.h +++ b/lldb/include/lldb/API/SBAddress.h @@ -86,6 +86,7 @@ class LLDB_API SBAddress { lldb::SBLineEntry GetLineEntry(); protected: + friend class SBAddressRange; friend class SBBlock; friend class SBBreakpoint; friend class SBBreakpointLocation; diff --git a/lldb/include/lldb/API/SBAddressRange.h b/lldb/include/lldb/API/SBAddressRange.h new file mode 100644 index 00000000000000..152bd82426af1c --- /dev/null +++ b/lldb/include/lldb/API/SBAddressRange.h @@ -0,0 +1,66 @@ +//===-- SBAddressRange.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBADDRESSRANGE_H +#define LLDB_API_SBADDRESSRANGE_H + +#include "lldb/API/SBDefines.h" + +namespace lldb { + +class LLDB_API SBAddressRange { +public: + SBAddressRange(); + + SBAddressRange(const lldb::SBAddressRange &rhs); + + SBAddressRange(lldb::SBAddress addr, lldb::addr_t byte_size); + + ~SBAddressRange(); + + const lldb::SBAddressRange &operator=(const lldb::SBAddressRange &rhs); + + void Clear(); + + /// Check the address range refers to a valid base address and has a byte + /// size greater than zero. + /// + /// \return + /// True if the address range is valid, false otherwise. + bool IsValid() const; + + /// Get the base address of the range. + /// + /// \return + /// Base address object. + lldb::SBAddress GetBaseAddress() const; + + /// Get the byte size of this range. + /// + /// \return + /// The size in bytes of this address range. + lldb::addr_t GetByteSize() const; + + bool operator==(const SBAddressRange &rhs); + + bool operator!=(const SBAddressRange &rhs); + + bool GetDescription(lldb::SBStream &description, const SBTarget target); + +private: + friend class SBAddressRangeList; + friend class SBBlock; + friend class SBFunction; + friend class SBProcess; + + AddressRangeUP m_opaque_up; +}; + +} // namespace lldb + +#endif // LLDB_API_SBADDRESSRANGE_H diff --git a/lldb/include/lldb/API/SBAddressRangeList.h b/lldb/include/lldb/API/SBAddressRangeList.h new file mode 100644 index 00000000000000..a123287ef1b4fa --- /dev/null +++ b/lldb/include/lldb/API/SBAddressRangeList.h @@ -0,0 +1,54 @@ +//===-- SBAddressRangeList.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBADDRESSRANGELIST_H +#define LLDB_API_SBADDRESSRANGELIST_H + +#include + +#include "lldb/API/SBDefines.h" + +namespace lldb_private { +class AddressRangeListImpl; +} + +namespace lldb { + +class LLDB_API SBAddressRangeList { +public: + SBAddressRangeList(); + + SBAddressRangeList(const lldb::SBAddressRangeList &rhs); + + ~SBAddressRangeList(); + + const lldb::SBAddressRangeList & + operator=(const lldb::SBAddressRangeList &rhs); + + uint32_t GetSize() const; + + void Clear(); + + SBAddressRange GetAddressRangeAtIndex(uint64_t idx); + + void Append(const lldb::SBAddressRange &addr_range); + + void Append(const lldb::SBAddressRangeList &addr_range_list); + + bool GetDescription(lldb::SBStream &description, const SBTarget &target); + +private: + friend class SBBlock; + friend class SBProcess; + + std::unique_ptr m_opaque_up; +}; + +} // namespace lldb + +#endif // LLDB_API_SBADDRESSRANGELIST_H diff --git a/lldb/include/lldb/API/SBBlock.h b/lldb/include/lldb/API/SBBlock.h index 2570099f7652f3..de4bb22be26925 100644 --- a/lldb/include/lldb/API/SBBlock.h +++ b/lldb/include/lldb/API/SBBlock.h @@ -9,6 +9,8 @@ #ifndef LLDB_API_SBBLOCK_H #define LLDB_API_SBBLOCK_H +#include "lldb/API/SBAddressRange.h" +#include "lldb/API/SBAddressRangeList.h" #include "lldb/API/SBDefines.h" #include "lldb/API/SBFrame.h" #include "lldb/API/SBTarget.h" @@ -52,6 +54,8 @@ class LLDB_API SBBlock { lldb::SBAddress GetRangeEndAddress(uint32_t idx); + lldb::SBAddressRangeList GetRanges(); + uint32_t GetRangeIndexForBlockAddress(lldb::SBAddress block_addr); lldb::SBValueList GetVariables(lldb::SBFrame &frame, bool arguments, diff --git a/lldb/include/lldb/API/SBDefines.h b/lldb/include/lldb/API/SBDefines.h index 1181920677b46f..87c0a1c3661ca3 100644 --- a/lldb/include/lldb/API/SBDefines.h +++ b/lldb/include/lldb/API/SBDefines.h @@ -43,6 
+43,8 @@ namespace lldb { class LLDB_API SBAddress; +class LLDB_API SBAddressRange; +class LLDB_API SBAddressRangeList; class LLDB_API SBAttachInfo; class LLDB_API SBBlock; class LLDB_API SBBreakpoint; diff --git a/lldb/include/lldb/API/SBFunction.h b/lldb/include/lldb/API/SBFunction.h index 71b372a818e4b5..df607fdc7ebf59 100644 --- a/lldb/include/lldb/API/SBFunction.h +++ b/lldb/include/lldb/API/SBFunction.h @@ -10,6 +10,7 @@ #define LLDB_API_SBFUNCTION_H #include "lldb/API/SBAddress.h" +#include "lldb/API/SBAddressRangeList.h" #include "lldb/API/SBDefines.h" #include "lldb/API/SBInstructionList.h" @@ -44,6 +45,8 @@ class LLDB_API SBFunction { lldb::SBAddress GetEndAddress(); + lldb::SBAddressRangeList GetRanges(); + const char *GetArgumentName(uint32_t arg_idx); uint32_t GetPrologueByteSize(); diff --git a/lldb/include/lldb/API/SBStream.h b/lldb/include/lldb/API/SBStream.h index 0e33f05b69916f..71caf41fd75491 100644 --- a/lldb/include/lldb/API/SBStream.h +++ b/lldb/include/lldb/API/SBStream.h @@ -62,6 +62,8 @@ class LLDB_API SBStream { protected: friend class SBAddress; + friend class SBAddressRange; + friend class SBAddressRangeList; friend class SBBlock; friend class SBBreakpoint; friend class SBBreakpointLocation; diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index feeaa1cb71132b..35c2ed9c20a238 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -943,6 +943,7 @@ class LLDB_API SBTarget { protected: friend class SBAddress; + friend class SBAddressRange; friend class SBBlock; friend class SBBreakpoint; friend class SBBreakpointList; diff --git a/lldb/include/lldb/Core/AddressRange.h b/lldb/include/lldb/Core/AddressRange.h index 4a33c2d7958765..68a3ad0edd2d79 100644 --- a/lldb/include/lldb/Core/AddressRange.h +++ b/lldb/include/lldb/Core/AddressRange.h @@ -86,6 +86,8 @@ class AddressRange { /// (LLDB_INVALID_ADDRESS) and a zero byte size. 
void Clear(); + bool IsValid() const; + /// Check if a section offset address is contained in this range. /// /// \param[in] so_addr @@ -236,12 +238,24 @@ class AddressRange { /// The new size in bytes of this address range. void SetByteSize(lldb::addr_t byte_size) { m_byte_size = byte_size; } + bool GetDescription(Stream *s, Target *target) const; + + bool operator==(const AddressRange &rhs); + + bool operator!=(const AddressRange &rhs); + protected: // Member variables Address m_base_addr; ///< The section offset base address of this range. lldb::addr_t m_byte_size = 0; ///< The size in bytes of this address range. }; +// Forward-declarable wrapper. +class AddressRanges : public std::vector { +public: + using std::vector::vector; +}; + } // namespace lldb_private #endif // LLDB_CORE_ADDRESSRANGE_H diff --git a/lldb/include/lldb/Core/AddressRangeListImpl.h b/lldb/include/lldb/Core/AddressRangeListImpl.h new file mode 100644 index 00000000000000..46ebfe73d4d92d --- /dev/null +++ b/lldb/include/lldb/Core/AddressRangeListImpl.h @@ -0,0 +1,51 @@ +//===-- AddressRangeListImpl.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_CORE_ADDRESSRANGELISTIMPL_H +#define LLDB_CORE_ADDRESSRANGELISTIMPL_H + +#include "lldb/Core/AddressRange.h" +#include + +namespace lldb { +class SBBlock; +} + +namespace lldb_private { + +class AddressRangeListImpl { +public: + AddressRangeListImpl(); + + AddressRangeListImpl(const AddressRangeListImpl &rhs) = default; + + AddressRangeListImpl &operator=(const AddressRangeListImpl &rhs); + + size_t GetSize() const; + + void Reserve(size_t capacity); + + void Append(const AddressRange &sb_region); + + void Append(const AddressRangeListImpl &list); + + void Clear(); + + lldb_private::AddressRange GetAddressRangeAtIndex(size_t index); + +private: + friend class lldb::SBBlock; + + AddressRanges &ref(); + + AddressRanges m_ranges; +}; + +} // namespace lldb_private + +#endif // LLDB_CORE_ADDRESSRANGE_H diff --git a/lldb/include/lldb/Symbol/Block.h b/lldb/include/lldb/Symbol/Block.h index 02fd2add531033..c9c4d5ad767d7e 100644 --- a/lldb/include/lldb/Symbol/Block.h +++ b/lldb/include/lldb/Symbol/Block.h @@ -355,6 +355,8 @@ class Block : public UserID, public SymbolContextScope { // be able to get at any of the address ranges in a block. 
bool GetRangeAtIndex(uint32_t range_idx, AddressRange &range); + AddressRanges GetRanges(); + bool GetStartAddress(Address &addr); void SetDidParseVariables(bool b, bool set_children); diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index 10ba921b9dac8c..6d880b4da03c99 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -19,6 +19,8 @@ class ASTResultSynthesizer; class ASTStructExtractor; class Address; class AddressRange; +class AddressRanges; +class AddressRangeList; class AddressResolver; class ArchSpec; class Architecture; @@ -308,6 +310,7 @@ template class StreamBuffer; namespace lldb { typedef std::shared_ptr ABISP; +typedef std::unique_ptr AddressRangeUP; typedef std::shared_ptr BatonSP; typedef std::shared_ptr BlockSP; typedef std::shared_ptr BreakpointSP; diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index e8228afe103f9c..63971016093151 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -42,6 +42,8 @@ set_target_properties(lldb-sbapi-dwarf-enums PROPERTIES FOLDER "LLDB/Tablegennin add_lldb_library(liblldb SHARED ${option_framework} SBAddress.cpp + SBAddressRange.cpp + SBAddressRangeList.cpp SBAttachInfo.cpp SBBlock.cpp SBBreakpoint.cpp diff --git a/lldb/source/API/SBAddressRange.cpp b/lldb/source/API/SBAddressRange.cpp new file mode 100644 index 00000000000000..9b1affdade439c --- /dev/null +++ b/lldb/source/API/SBAddressRange.cpp @@ -0,0 +1,103 @@ +//===-- SBAddressRange.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBAddressRange.h" +#include "Utils.h" +#include "lldb/API/SBAddress.h" +#include "lldb/API/SBStream.h" +#include "lldb/API/SBTarget.h" +#include "lldb/Core/AddressRange.h" +#include "lldb/Core/Section.h" +#include "lldb/Utility/Instrumentation.h" +#include "lldb/Utility/Stream.h" +#include +#include + +using namespace lldb; +using namespace lldb_private; + +SBAddressRange::SBAddressRange() + : m_opaque_up(std::make_unique()) { + LLDB_INSTRUMENT_VA(this); +} + +SBAddressRange::SBAddressRange(const SBAddressRange &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + m_opaque_up = clone(rhs.m_opaque_up); +} + +SBAddressRange::SBAddressRange(lldb::SBAddress addr, lldb::addr_t byte_size) + : m_opaque_up(std::make_unique(addr.ref(), byte_size)) { + LLDB_INSTRUMENT_VA(this, addr, byte_size); +} + +SBAddressRange::~SBAddressRange() = default; + +const SBAddressRange &SBAddressRange::operator=(const SBAddressRange &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + if (this != &rhs) + m_opaque_up = clone(rhs.m_opaque_up); + return *this; +} + +bool SBAddressRange::operator==(const SBAddressRange &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + if (!IsValid() || !rhs.IsValid()) + return false; + return m_opaque_up->operator==(*(rhs.m_opaque_up)); +} + +bool SBAddressRange::operator!=(const SBAddressRange &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + return !(*this == rhs); +} + +void SBAddressRange::Clear() { + LLDB_INSTRUMENT_VA(this); + + m_opaque_up.reset(); +} + +bool SBAddressRange::IsValid() const { + LLDB_INSTRUMENT_VA(this); + + return m_opaque_up && m_opaque_up->IsValid(); +} + +lldb::SBAddress SBAddressRange::GetBaseAddress() const { + LLDB_INSTRUMENT_VA(this); + + if (!IsValid()) + return lldb::SBAddress(); + return lldb::SBAddress(m_opaque_up->GetBaseAddress()); +} + +lldb::addr_t SBAddressRange::GetByteSize() const { 
+ LLDB_INSTRUMENT_VA(this); + + if (!IsValid()) + return 0; + return m_opaque_up->GetByteSize(); +} + +bool SBAddressRange::GetDescription(SBStream &description, + const SBTarget target) { + LLDB_INSTRUMENT_VA(this, description, target); + + Stream &stream = description.ref(); + if (!IsValid()) { + stream << ""; + return true; + } + m_opaque_up->GetDescription(&stream, target.GetSP().get()); + return true; +} diff --git a/lldb/source/API/SBAddressRangeList.cpp b/lldb/source/API/SBAddressRangeList.cpp new file mode 100644 index 00000000000000..20660b3ff20882 --- /dev/null +++ b/lldb/source/API/SBAddressRangeList.cpp @@ -0,0 +1,94 @@ +//===-- SBAddressRangeList.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBAddressRangeList.h" +#include "Utils.h" +#include "lldb/API/SBAddressRange.h" +#include "lldb/API/SBStream.h" +#include "lldb/API/SBTarget.h" +#include "lldb/Core/AddressRangeListImpl.h" +#include "lldb/Utility/Instrumentation.h" +#include "lldb/Utility/Stream.h" + +#include + +using namespace lldb; +using namespace lldb_private; + +SBAddressRangeList::SBAddressRangeList() + : m_opaque_up(std::make_unique()) { + LLDB_INSTRUMENT_VA(this); +} + +SBAddressRangeList::SBAddressRangeList(const SBAddressRangeList &rhs) + : m_opaque_up(std::make_unique(*rhs.m_opaque_up)) { + LLDB_INSTRUMENT_VA(this, rhs); +} + +SBAddressRangeList::~SBAddressRangeList() = default; + +const SBAddressRangeList & +SBAddressRangeList::operator=(const SBAddressRangeList &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + if (this != &rhs) + *m_opaque_up = *rhs.m_opaque_up; + return *this; +} + +uint32_t SBAddressRangeList::GetSize() const { + LLDB_INSTRUMENT_VA(this); + + return 
m_opaque_up->GetSize(); +} + +SBAddressRange SBAddressRangeList::GetAddressRangeAtIndex(uint64_t idx) { + LLDB_INSTRUMENT_VA(this, idx); + + SBAddressRange sb_addr_range; + (*sb_addr_range.m_opaque_up) = m_opaque_up->GetAddressRangeAtIndex(idx); + return sb_addr_range; +} + +void SBAddressRangeList::Clear() { + LLDB_INSTRUMENT_VA(this); + + m_opaque_up->Clear(); +} + +void SBAddressRangeList::Append(const SBAddressRange &sb_addr_range) { + LLDB_INSTRUMENT_VA(this, sb_addr_range); + + m_opaque_up->Append(*sb_addr_range.m_opaque_up); +} + +void SBAddressRangeList::Append(const SBAddressRangeList &sb_addr_range_list) { + LLDB_INSTRUMENT_VA(this, sb_addr_range_list); + + m_opaque_up->Append(*sb_addr_range_list.m_opaque_up); +} + +bool SBAddressRangeList::GetDescription(SBStream &description, + const SBTarget &target) { + LLDB_INSTRUMENT_VA(this, description, target); + + const uint32_t num_ranges = GetSize(); + bool is_first = true; + Stream &stream = description.ref(); + stream << "["; + for (uint32_t i = 0; i < num_ranges; ++i) { + if (is_first) { + is_first = false; + } else { + stream.Printf(", "); + } + GetAddressRangeAtIndex(i).GetDescription(description, target); + } + stream << "]"; + return true; +} diff --git a/lldb/source/API/SBBlock.cpp b/lldb/source/API/SBBlock.cpp index 7d7565340836b1..2577b14920f065 100644 --- a/lldb/source/API/SBBlock.cpp +++ b/lldb/source/API/SBBlock.cpp @@ -13,6 +13,7 @@ #include "lldb/API/SBStream.h" #include "lldb/API/SBValue.h" #include "lldb/Core/AddressRange.h" +#include "lldb/Core/AddressRangeListImpl.h" #include "lldb/Core/ValueObjectVariable.h" #include "lldb/Symbol/Block.h" #include "lldb/Symbol/Function.h" @@ -219,6 +220,15 @@ lldb::SBAddress SBBlock::GetRangeEndAddress(uint32_t idx) { return sb_addr; } +lldb::SBAddressRangeList SBBlock::GetRanges() { + LLDB_INSTRUMENT_VA(this); + + lldb::SBAddressRangeList sb_ranges; + if (m_opaque_ptr) + sb_ranges.m_opaque_up->ref() = m_opaque_ptr->GetRanges(); + return sb_ranges; +} + 
uint32_t SBBlock::GetRangeIndexForBlockAddress(lldb::SBAddress block_addr) { LLDB_INSTRUMENT_VA(this, block_addr); diff --git a/lldb/source/API/SBFunction.cpp b/lldb/source/API/SBFunction.cpp index a01c7f79bbd31f..6a97352fc2c2fd 100644 --- a/lldb/source/API/SBFunction.cpp +++ b/lldb/source/API/SBFunction.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/API/SBFunction.h" +#include "lldb/API/SBAddressRange.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBStream.h" #include "lldb/Core/Disassembler.h" @@ -160,6 +161,19 @@ SBAddress SBFunction::GetEndAddress() { return addr; } +lldb::SBAddressRangeList SBFunction::GetRanges() { + LLDB_INSTRUMENT_VA(this); + + lldb::SBAddressRangeList ranges; + if (m_opaque_ptr) { + lldb::SBAddressRange range; + (*range.m_opaque_up) = m_opaque_ptr->GetAddressRange(); + ranges.Append(std::move(range)); + } + + return ranges; +} + const char *SBFunction::GetArgumentName(uint32_t arg_idx) { LLDB_INSTRUMENT_VA(this, arg_idx); diff --git a/lldb/source/Core/AddressRange.cpp b/lldb/source/Core/AddressRange.cpp index 1830f2ccd47fec..6cef7e149cd20b 100644 --- a/lldb/source/Core/AddressRange.cpp +++ b/lldb/source/Core/AddressRange.cpp @@ -14,6 +14,7 @@ #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/Stream.h" #include "lldb/lldb-defines.h" +#include "lldb/lldb-types.h" #include "llvm/Support/Compiler.h" @@ -145,6 +146,10 @@ void AddressRange::Clear() { m_byte_size = 0; } +bool AddressRange::IsValid() const { + return m_base_addr.IsValid() && (m_byte_size > 0); +} + bool AddressRange::Dump(Stream *s, Target *target, Address::DumpStyle style, Address::DumpStyle fallback_style) const { addr_t vmaddr = LLDB_INVALID_ADDRESS; @@ -203,3 +208,41 @@ void AddressRange::DumpDebug(Stream *s) const { static_cast(m_base_addr.GetSection().get()), m_base_addr.GetOffset(), GetByteSize()); } + +bool AddressRange::GetDescription(Stream *s, Target *target) const { + addr_t start_addr 
= m_base_addr.GetLoadAddress(target); + if (start_addr != LLDB_INVALID_ADDRESS) { + // We have a valid target and the address was resolved, or we have a base + // address with no section. Just print out a raw address range: [, + // ) + s->Printf("[0x%" PRIx64 "-0x%" PRIx64 ")", start_addr, + start_addr + GetByteSize()); + return true; + } + + // Either no target or the address wasn't resolved, print as + // [-) + const char *file_name = ""; + const auto section_sp = m_base_addr.GetSection(); + if (section_sp) { + if (const auto object_file = section_sp->GetObjectFile()) + file_name = object_file->GetFileSpec().GetFilename().AsCString(); + } + start_addr = m_base_addr.GetFileAddress(); + const addr_t end_addr = (start_addr == LLDB_INVALID_ADDRESS) + ? LLDB_INVALID_ADDRESS + : start_addr + GetByteSize(); + s->Printf("%s[0x%" PRIx64 "-0x%" PRIx64 ")", file_name, start_addr, end_addr); + return true; +} + +bool AddressRange::operator==(const AddressRange &rhs) { + if (!IsValid() || !rhs.IsValid()) + return false; + return m_base_addr == rhs.GetBaseAddress() && + m_byte_size == rhs.GetByteSize(); +} + +bool AddressRange::operator!=(const AddressRange &rhs) { + return !(*this == rhs); +} diff --git a/lldb/source/Core/AddressRangeListImpl.cpp b/lldb/source/Core/AddressRangeListImpl.cpp new file mode 100644 index 00000000000000..d405cf0fa3ec35 --- /dev/null +++ b/lldb/source/Core/AddressRangeListImpl.cpp @@ -0,0 +1,50 @@ +//===-- AddressRangeListImpl.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Core/AddressRangeListImpl.h" + +using namespace lldb; +using namespace lldb_private; + +AddressRangeListImpl::AddressRangeListImpl() : m_ranges() {} + +AddressRangeListImpl & +AddressRangeListImpl::operator=(const AddressRangeListImpl &rhs) { + if (this == &rhs) + return *this; + m_ranges = rhs.m_ranges; + return *this; +} + +size_t AddressRangeListImpl::GetSize() const { return m_ranges.size(); } + +void AddressRangeListImpl::Reserve(size_t capacity) { + m_ranges.reserve(capacity); +} + +void AddressRangeListImpl::Append(const AddressRange &sb_region) { + m_ranges.emplace_back(sb_region); +} + +void AddressRangeListImpl::Append(const AddressRangeListImpl &list) { + Reserve(GetSize() + list.GetSize()); + + for (const auto &range : list.m_ranges) + Append(range); +} + +void AddressRangeListImpl::Clear() { m_ranges.clear(); } + +lldb_private::AddressRange +AddressRangeListImpl::GetAddressRangeAtIndex(size_t index) { + if (index >= GetSize()) + return AddressRange(); + return m_ranges[index]; +} + +AddressRanges &AddressRangeListImpl::ref() { return m_ranges; } diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt index f24dbbd45a8e8c..dbc620b91b1ed1 100644 --- a/lldb/source/Core/CMakeLists.txt +++ b/lldb/source/Core/CMakeLists.txt @@ -20,6 +20,7 @@ endif() add_lldb_library(lldbCore Address.cpp AddressRange.cpp + AddressRangeListImpl.cpp AddressResolver.cpp AddressResolverFileLine.cpp Communication.cpp diff --git a/lldb/source/Symbol/Block.cpp b/lldb/source/Symbol/Block.cpp index 6eeabe0ff5e4d0..f7d9c0d2d33065 100644 --- a/lldb/source/Symbol/Block.cpp +++ b/lldb/source/Symbol/Block.cpp @@ -314,6 +314,22 @@ bool Block::GetRangeAtIndex(uint32_t range_idx, AddressRange &range) { return false; } +AddressRanges Block::GetRanges() { + AddressRanges ranges; + Function *function = 
CalculateSymbolContextFunction(); + if (!function) + return ranges; + for (size_t i = 0, e = m_ranges.GetSize(); i < e; ++i) { + ranges.emplace_back(); + auto &range = ranges.back(); + const Range &vm_range = m_ranges.GetEntryRef(i); + range.GetBaseAddress() = function->GetAddressRange().GetBaseAddress(); + range.GetBaseAddress().Slide(vm_range.GetRangeBase()); + range.SetByteSize(vm_range.GetByteSize()); + } + return ranges; +} + bool Block::GetStartAddress(Address &addr) { if (m_ranges.IsEmpty()) return false; diff --git a/lldb/test/API/python_api/address_range/Makefile b/lldb/test/API/python_api/address_range/Makefile new file mode 100644 index 00000000000000..99998b20bcb050 --- /dev/null +++ b/lldb/test/API/python_api/address_range/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/python_api/address_range/TestAddressRange.py b/lldb/test/API/python_api/address_range/TestAddressRange.py new file mode 100644 index 00000000000000..8c27558af4752d --- /dev/null +++ b/lldb/test/API/python_api/address_range/TestAddressRange.py @@ -0,0 +1,256 @@ +""" +Test SBAddressRange APIs. 
+""" + +import lldb +from lldbsuite.test.lldbtest import * + + +class AddressRangeTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + TestBase.setUp(self) + + self.build() + exe = self.getBuildArtifact("a.out") + + self.dbg.SetAsync(True) + + self.target = self.dbg.CreateTarget(exe) + self.assertTrue(self.target, VALID_TARGET) + self.launch_info = self.target.GetLaunchInfo() + self.launch_info.SetWorkingDirectory(self.get_process_working_directory()) + + self.bp1 = self.target.BreakpointCreateByName("main", "a.out") + self.bp2 = self.target.BreakpointCreateByName("foo", "a.out") + self.bp3 = self.target.BreakpointCreateByName("bar", "a.out") + + self.assertTrue(self.bp1.IsValid()) + self.assertTrue(self.bp2.IsValid()) + self.assertTrue(self.bp3.IsValid()) + + self.addr1 = self.bp1.GetLocationAtIndex(0).GetAddress() + self.addr2 = self.bp2.GetLocationAtIndex(0).GetAddress() + self.addr3 = self.bp3.GetLocationAtIndex(0).GetAddress() + + self.assertTrue(self.addr1.IsValid()) + self.assertTrue(self.addr2.IsValid()) + self.assertTrue(self.addr3.IsValid()) + + def test_address_range_default(self): + """Testing default constructor.""" + empty_range = lldb.SBAddressRange() + self.assertEqual(empty_range.IsValid(), False) + + def test_address_range_construction(self): + """Make sure the construction and getters work.""" + range = lldb.SBAddressRange(self.addr1, 8) + self.assertEqual(range.IsValid(), True) + self.assertEqual(range.GetBaseAddress(), self.addr1) + self.assertEqual(range.GetByteSize(), 8) + + def test_address_range_clear(self): + """Make sure the clear method works.""" + range = lldb.SBAddressRange(self.addr1, 8) + self.assertEqual(range.IsValid(), True) + self.assertEqual(range.GetBaseAddress(), self.addr1) + self.assertEqual(range.GetByteSize(), 8) + + range.Clear() + self.assertEqual(range.IsValid(), False) + + def test_function(self): + """Make sure the range works in SBFunction APIs.""" + + # Setup breakpoints in main + loc = 
self.bp1.GetLocationAtIndex(0) + loc_addr = loc.GetAddress() + func = loc_addr.GetFunction() + ranges = func.GetRanges() + self.assertEqual(ranges.GetSize(), 1) + + range = ranges.GetAddressRangeAtIndex(0) + self.assertEqual( + range.GetByteSize(), + func.GetEndAddress().GetOffset() - func.GetStartAddress().GetOffset(), + ) + self.assertEqual( + range.GetBaseAddress().GetOffset(), + func.GetStartAddress().GetOffset(), + ) + + def test_block(self): + """Make sure the range works in SBBlock APIs.""" + loc = self.bp1.GetLocationAtIndex(0) + loc_addr = loc.GetAddress() + block = loc_addr.GetBlock() + + ranges = block.GetRanges() + self.assertEqual(ranges.GetSize(), 1) + + range = ranges.GetAddressRangeAtIndex(0) + self.assertEqual( + range.GetByteSize(), + block.GetRangeEndAddress(0).GetOffset() + - block.GetRangeStartAddress(0).GetOffset(), + ) + self.assertEqual( + range.GetBaseAddress().GetOffset(), + block.GetRangeStartAddress(0).GetOffset(), + ) + + def test_address_range_list(self): + """Make sure the SBAddressRangeList works by adding and getting ranges.""" + range1 = lldb.SBAddressRange(self.addr1, 8) + range2 = lldb.SBAddressRange(self.addr2, 16) + range3 = lldb.SBAddressRange(self.addr3, 32) + + range_list = lldb.SBAddressRangeList() + self.assertEqual(range_list.GetSize(), 0) + + range_list.Append(range1) + range_list.Append(range2) + range_list.Append(range3) + self.assertEqual(range_list.GetSize(), 3) + self.assertRaises(IndexError, lambda: range_list[3]) + + range1_copy = range_list.GetAddressRangeAtIndex(0) + self.assertEqual(range1.GetByteSize(), range1_copy.GetByteSize()) + self.assertEqual( + range1.GetBaseAddress().GetOffset(), + range1_copy.GetBaseAddress().GetOffset(), + ) + + range2_copy = range_list.GetAddressRangeAtIndex(1) + self.assertEqual(range2.GetByteSize(), range2_copy.GetByteSize()) + self.assertEqual( + range2.GetBaseAddress().GetOffset(), + range2_copy.GetBaseAddress().GetOffset(), + ) + + range3_copy = 
range_list.GetAddressRangeAtIndex(2) + self.assertEqual(range3.GetByteSize(), range3_copy.GetByteSize()) + self.assertEqual( + range3.GetBaseAddress().GetOffset(), + range3_copy.GetBaseAddress().GetOffset(), + ) + + range_list.Clear() + self.assertEqual(range_list.GetSize(), 0) + + def test_address_range_list_len(self): + """Make sure the len() operator works.""" + range = lldb.SBAddressRange(self.addr1, 8) + + range_list = lldb.SBAddressRangeList() + self.assertEqual(len(range_list), 0) + + range_list.Append(range) + self.assertEqual(len(range_list), 1) + + def test_address_range_list_iterator(self): + """Make sure the SBAddressRangeList iterator works.""" + range1 = lldb.SBAddressRange(self.addr1, 8) + range2 = lldb.SBAddressRange(self.addr2, 16) + range3 = lldb.SBAddressRange(self.addr3, 32) + + range_list = lldb.SBAddressRangeList() + range_list.Append(range1) + range_list.Append(range2) + range_list.Append(range3) + self.assertEqual(range_list.GetSize(), 3) + + # Test the iterator + for range in range_list: + self.assertTrue(range.IsValid()) + + def test_address_range_print_invalid(self): + """Make sure the SBAddressRange can be printed when invalid.""" + range = lldb.SBAddressRange() + self.assertEqual(str(range), "") + + def test_address_range_print_resolved(self): + """Make sure the SBAddressRange can be printed when resolved.""" + lldb.target = self.target + error = lldb.SBError() + process = self.target.Launch(self.launch_info, error) + self.assertTrue(error.Success(), "Make sure process launched successfully") + self.assertTrue(process, PROCESS_IS_VALID) + self.assertState(process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) + + loc = self.bp1.GetLocationAtIndex(0) + loc_addr = loc.GetAddress() + func = loc_addr.GetFunction() + range = func.GetRanges().GetAddressRangeAtIndex(0) + range_str = str(range) + # [0x1000-0x2000] // Resolved with target or addresses without sections + self.assertRegex(range_str, "^\[0x[0-9a-f]+\-0x[0-9a-f]+\)$") + 
process.Kill() + + def test_address_range_print_no_section_resolved(self): + """Make sure the SBAddressRange can be printed with no secion.""" + lldb.target = self.target + error = lldb.SBError() + process = self.target.Launch(self.launch_info, error) + self.assertTrue(error.Success(), "Make sure process launched successfully") + self.assertTrue(process, PROCESS_IS_VALID) + self.assertState(process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) + + loc = self.bp1.GetLocationAtIndex(0) + loc_addr = loc.GetAddress() + func = loc_addr.GetFunction() + range = func.GetRanges().GetAddressRangeAtIndex(0) + + addr = lldb.SBAddress() + addr.SetAddress(lldb.SBSection(), range.GetBaseAddress().GetOffset()) + self.assertFalse(addr.GetSection().IsValid()) + range = lldb.SBAddressRange(addr, range.GetByteSize()) + + range_str = str(range) + # [0x1000-0x2000] // Resolved with target or addresses without sections + self.assertRegex(range_str, "^\[0x[0-9a-f]+\-0x[0-9a-f]+\)$") + process.Kill() + + def test_address_range_print_not_resolved(self): + """Make sure the SBAddressRange can be printed when not resolved.""" + range = lldb.SBAddressRange(self.addr1, 8) + range_str = str(range) + # a.out[0x1000-0x2000] // Without target + self.assertRegex(range_str, "^a.out\[0x[0-9a-f]+\-0x[0-9a-f]+\)$") + + def test_address_range_list_print(self): + """Make sure the SBAddressRangeList can be printed.""" + range1 = lldb.SBAddressRange(self.addr1, 8) + range2 = lldb.SBAddressRange(self.addr2, 16) + range3 = lldb.SBAddressRange(self.addr3, 32) + self.dbg.SetAsync(True) + + range_list = lldb.SBAddressRangeList() + self.assertEqual(range_list.GetSize(), 0) + + range_list.Append(range1) + range_list.Append(range2) + range_list.Append(range3) + self.assertEqual(range_list.GetSize(), 3) + + range_list_str = str(range_list) + self.assertTrue(range_list_str.startswith("[")) + self.assertGreater(range_list_str.count(","), 1) + self.assertTrue(range_list_str.endswith("]")) + + def 
test_address_range_list_indexing(self): + """Make sure the SBAddressRangeList can be printed.""" + range1 = lldb.SBAddressRange(self.addr1, 8) + range2 = lldb.SBAddressRange(self.addr2, 16) + range_list = lldb.SBAddressRangeList() + range_list.Append(range1) + range_list.Append(range2) + + self.assertEqual(range_list.GetSize(), 2) + self.assertRaises(IndexError, lambda: range_list[2]) + self.assertRaises(TypeError, lambda: range_list["0"]) + self.assertEqual(range_list[0], range1) + self.assertEqual(range_list[1], range2) + self.assertEqual(range_list[-1], range2) + self.assertEqual(range_list[-2], range1) diff --git a/lldb/test/API/python_api/address_range/main.cpp b/lldb/test/API/python_api/address_range/main.cpp new file mode 100644 index 00000000000000..b6eaec4a23699b --- /dev/null +++ b/lldb/test/API/python_api/address_range/main.cpp @@ -0,0 +1,8 @@ +void foo() {} +void bar() {} + +int main() { + foo(); + bar(); + return 0; +} From 79c7342f49f1ed7aa971e7857954b45906154943 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 28 May 2024 16:31:25 +0000 Subject: [PATCH 33/89] [gn build] Port 42944e460082 --- llvm/utils/gn/secondary/lldb/source/API/BUILD.gn | 2 ++ llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn | 1 + 2 files changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn index c99c1b5483355b..f0bf6a8f3dbaf8 100644 --- a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn @@ -40,6 +40,8 @@ target(liblldb_type, "liblldb") { include_dirs = [ ".." 
] sources = [ "SBAddress.cpp", + "SBAddressRange.cpp", + "SBAddressRangeList.cpp", "SBAttachInfo.cpp", "SBBlock.cpp", "SBBreakpoint.cpp", diff --git a/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn index 30a9fb3ecceaa0..0c9632a0a1915f 100644 --- a/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn @@ -45,6 +45,7 @@ static_library("Core") { sources = [ "Address.cpp", "AddressRange.cpp", + "AddressRangeListImpl.cpp", "AddressResolver.cpp", "AddressResolverFileLine.cpp", "Communication.cpp", From 7bea41e173367e2a535bd2188fd652a2ca267b90 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 28 May 2024 17:42:58 +0100 Subject: [PATCH 34/89] LoopIdiomRecognize: strip bad TODO (NFC) (#92890) There are several reasons why handling powi in LoopIdiomRecognize is a bad idea: - powi corresponds to a GCC builtin that is only defined for C int (which is i32 for most targets). - powi isn't always lowered by targets correctly for non-i32 parameters. Several targets fail to compile llvm.powi.f32.i16, for example. - Unlike memcpy and memset, which tend to be important enough internal intrinsics that you have to handle them correctly even in freestanding modes, powi isn't. Strip this bad TODO to avoid misleading contributors. --- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index c7e25c9f3d2c92..3fe5478408d457 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -22,8 +22,6 @@ // // Future loop memory idioms to recognize: // memcmp, strlen, etc. 
-// Future floating point idioms to recognize in -ffast-math mode: -// fpowi // // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). @@ -1107,7 +1105,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( GV->setAlignment(Align(16)); Value *PatternPtr = GV; NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); - + // Set the TBAA info if present. if (AATags.TBAA) NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA); @@ -1117,7 +1115,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( if (AATags.NoAlias) NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias); - } + } NewCall->setDebugLoc(TheStore->getDebugLoc()); From 16a5fd3fdb91ffb39b97dbd3a7e9346ba406360d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 28 May 2024 18:57:38 +0200 Subject: [PATCH 35/89] DAG: Use flags in isLegalToCombineMinNumMaxNum (#93555) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 +- .../AMDGPU/select-flags-to-fmin-fmax.ll | 1757 +++++++++++++++++ 2 files changed, 1768 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 93d866384b4829..2f4fdf5208d076 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11186,17 +11186,19 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) { return SDValue(); } -// FIXME: This should be checking for no signed zeros on individual operands, as -// well as no nans. 
static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS, - SDValue RHS, + SDValue RHS, const SDNodeFlags Flags, const TargetLowering &TLI) { - const TargetOptions &Options = DAG.getTarget().Options; EVT VT = LHS.getValueType(); + if (!VT.isFloatingPoint()) + return false; + + const TargetOptions &Options = DAG.getTarget().Options; - return Options.NoSignedZerosFPMath && VT.isFloatingPoint() && + return (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) && TLI.isProfitableToCombineMinNumMaxNum(VT) && - DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS); + (Flags.hasNoNaNs() || + (DAG.isKnownNeverNaN(RHS) && DAG.isKnownNeverNaN(LHS))); } static SDValue combineMinNumMaxNumImpl(const SDLoc &DL, EVT VT, SDValue LHS, @@ -11674,7 +11676,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // select (fcmp gt x, y), x, y -> fmaxnum x, y // // This is OK if we don't care what happens if either operand is a NaN. - if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI)) + if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, Flags, TLI)) if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2, CC)) return FMinMax; @@ -12267,7 +12269,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { // This is OK if we don't care about what happens if either operand is a // NaN. 
// - if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) { + if (N0.hasOneUse() && + isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, N->getFlags(), TLI)) { if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC)) return FMinMax; } diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll new file mode 100644 index 00000000000000..50a3336a7483c7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -0,0 +1,1757 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +; Test if fcmp+select patterns form min/max instructions when allowed +; by flags. + +; TODO: Merge with fmin_legacy.ll/fmax_legacy.ll + +define float @v_test_fmin_legacy_ule_f32_safe(float %a, float %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f32_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f32_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f32_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule float %a, %b + %val = 
select i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmin_legacy_ule_f32_nnan_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f32_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f32_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f32_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule float %a, %b + %val = select nnan i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmin_legacy_ule_f32_nsz_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f32_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f32_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f32_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + 
%cmp = fcmp ule float %a, %b + %val = select nsz i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmin_legacy_ule_f32_nnan_nsz_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f32_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f32_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f32_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_safe(float %a, float %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge float %a, %b + %val = select i1 %cmp, float %a, float 
%b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_nnan_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge float %a, %b + %val = select nnan i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_nsz_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge float %a, %b + 
%val = select nsz i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_nnan_nsz_flag(float %a, float %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + ret float %val +} + +define <2 x float> @v_test_fmin_legacy_ule_v2f32_safe(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: 
v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x float> %a, %b + %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x float> %a, %b + %val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmin_legacy_ule_v2f32_nsz_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
v_test_fmin_legacy_ule_v2f32_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x float> %a, %b + %val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, 
v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x float> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmax_legacy_uge_v2f32_safe(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x float> %a, %b + %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_flag: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x float> %a, %b + %val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmax_legacy_uge_v2f32_nsz_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; 
GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x float> %a, %b + %val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag(<2 x float> %a, <2 x float> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x float> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} + +define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
v_test_fmin_legacy_ule_f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select nnan i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: 
+; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select nsz i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmin_legacy_ule_f16_nnan_nsz_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: 
s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge half %a, %b + %val = select i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: 
v_test_fmax_legacy_uge_f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge half %a, %b + %val = select nnan i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge half %a, %b + %val = select nsz i1 %cmp, half %a, half %b + ret half %val +} + +define half @v_test_fmax_legacy_uge_f16_nnan_nsz_flag(half %a, half %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; 
GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + +define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: 
s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select nnan nsz <2 x 
i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x half> %a, %b + %val = select <2 x i1> %cmp, <2 x 
half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x half> %a, %b + %val = select nnan <2 x i1> 
%cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x half> %a, %b + %val = select nsz <2 
x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag(<2 x half> %a, <2 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <2 x half> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + +define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 
v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; 
GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; 
GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: 
v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_safe: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_safe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 
v5, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <4 x half> %a, %b + %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: 
v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <4 x half> %a, %b + %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <4 x 
half> %a, %b + %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag(<4 x half> %a, <4 x half> %b) { +; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp uge <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + +define float @v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs(float 
%arg0, float %arg1) { +; GFX7-LABEL: v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a = fadd nnan float %arg0, %arg0 + %b = fadd nnan float %arg1, %arg1 + %cmp = fcmp ule float %a, %b + %val = select nsz i1 %cmp, float %a, float %b + ret float %val +} + +define float @v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs(float %arg0, float %arg1) { +; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a = fadd nnan float %arg0, %arg0 + %b = fadd nnan float %arg1, %arg1 + %cmp = fcmp uge float %a, %b + %val = select nsz i1 %cmp, float %a, float %b + ret float %val +} From b963931eb8bda810e2a8ad08832402993b931d69 Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Tue, 28 May 2024 10:21:22 -0700 Subject: [PATCH 36/89] [lld-macho][ObjC] Implement category merging into base class (#92448) Currently category merging only supports merging multiple categories into one. With this commit we add the ability to fully merge categories into the base class, if the base class is included in the current module. This is the optimal approach for defined classes. 
--- lld/MachO/ObjC.cpp | 179 +++++++++++++-- .../objc-category-merging-complete-test.s | 210 ++++++++++++++++++ ...imal.s => objc-category-merging-minimal.s} | 125 ++++++++++- 3 files changed, 500 insertions(+), 14 deletions(-) rename lld/test/MachO/{objc-category-merging-extern-class-minimal.s => objc-category-merging-minimal.s} (59%) diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp index 9d1612beae872e..635ded554497ba 100644 --- a/lld/MachO/ObjC.cpp +++ b/lld/MachO/ObjC.cpp @@ -379,12 +379,21 @@ class ObjcCategoryMerger { InfoWriteSection catPtrListInfo; }; - // Information about a pointer list in the original categories (method lists, - // protocol lists, etc) + // Information about a pointer list in the original categories or class(method + // lists, protocol lists, etc) struct PointerListInfo { + PointerListInfo() = default; + PointerListInfo(const PointerListInfo &) = default; PointerListInfo(const char *_categoryPrefix, uint32_t _pointersPerStruct) : categoryPrefix(_categoryPrefix), pointersPerStruct(_pointersPerStruct) {} + + inline bool operator==(const PointerListInfo &cmp) { + return pointersPerStruct == cmp.pointersPerStruct && + structSize == cmp.structSize && structCount == cmp.structCount && + allPtrs == cmp.allPtrs; + } + const char *categoryPrefix; uint32_t pointersPerStruct = 0; @@ -395,9 +404,9 @@ class ObjcCategoryMerger { std::vector allPtrs; }; - // Full information about all the categories that extend a class. This will - // include all the additional methods, protocols, and properties that are - // contained in all the categories that extend a particular class. + // Full information describing an ObjC class . This will include all the + // additional methods, protocols, and properties that are contained in the + // class and all the categories that extend a particular class. 
struct ClassExtensionInfo { ClassExtensionInfo(CategoryLayout &_catLayout) : catLayout(_catLayout){}; @@ -449,6 +458,9 @@ class ObjcCategoryMerger { void parseProtocolListInfo(const ConcatInputSection *isec, uint32_t secOffset, PointerListInfo &ptrList); + PointerListInfo parseProtocolListInfo(const ConcatInputSection *isec, + uint32_t secOffset); + void parsePointerListInfo(const ConcatInputSection *isec, uint32_t secOffset, PointerListInfo &ptrList); @@ -456,9 +468,9 @@ class ObjcCategoryMerger { const ClassExtensionInfo &extInfo, const PointerListInfo &ptrList); - void emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset, - const ClassExtensionInfo &extInfo, - const PointerListInfo &ptrList); + Defined *emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset, + const ClassExtensionInfo &extInfo, + const PointerListInfo &ptrList); Defined *emitCategory(const ClassExtensionInfo &extInfo); Defined *emitCatListEntrySec(const std::string &forCategoryName, @@ -474,6 +486,10 @@ class ObjcCategoryMerger { uint32_t offset); Defined *tryGetDefinedAtIsecOffset(const ConcatInputSection *isec, uint32_t offset); + Defined *getClassRo(const Defined *classSym, bool getMetaRo); + void mergeCategoriesIntoBaseClass(const Defined *baseClass, + std::vector &categories); + void eraseSymbolAtIsecOffset(ConcatInputSection *isec, uint32_t offset); void tryEraseDefinedAtIsecOffset(const ConcatInputSection *isec, uint32_t offset); @@ -552,6 +568,29 @@ ObjcCategoryMerger::tryGetDefinedAtIsecOffset(const ConcatInputSection *isec, return dyn_cast_or_null(sym); } +// Get the class's ro_data symbol. If getMetaRo is true, then we will return +// the meta-class's ro_data symbol. Otherwise, we will return the class +// (instance) ro_data symbol. 
+Defined *ObjcCategoryMerger::getClassRo(const Defined *classSym, + bool getMetaRo) { + ConcatInputSection *isec = dyn_cast(classSym->isec()); + if (!isec) + return nullptr; + + if (!getMetaRo) + return tryGetDefinedAtIsecOffset(isec, classLayout.roDataOffset + + classSym->value); + + Defined *metaClass = tryGetDefinedAtIsecOffset( + isec, classLayout.metaClassOffset + classSym->value); + if (!metaClass) + return nullptr; + + return tryGetDefinedAtIsecOffset( + dyn_cast(metaClass->isec()), + classLayout.roDataOffset); +} + // Given an ConcatInputSection or CStringInputSection and an offset, if there is // a symbol(Defined) at that offset, then erase the symbol (mark it not live) void ObjcCategoryMerger::tryEraseDefinedAtIsecOffset( @@ -663,6 +702,15 @@ void ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec, "Protocol list end offset does not match expected size"); } +// Parse a protocol list and return the PointerListInfo for it +ObjcCategoryMerger::PointerListInfo +ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec, + uint32_t secOffset) { + PointerListInfo ptrList; + parseProtocolListInfo(isec, secOffset, ptrList); + return ptrList; +} + // Parse a pointer list that might be linked to ConcatInputSection at a given // offset. This can be used for instance methods, class methods, instance props // and class props since they have the same format. @@ -769,11 +817,11 @@ void ObjcCategoryMerger::parseCatInfoToExtInfo(const InfoInputCategory &catInfo, // Generate a protocol list (including header) and link it into the parent at // the specified offset. 
-void ObjcCategoryMerger::emitAndLinkProtocolList( +Defined *ObjcCategoryMerger::emitAndLinkProtocolList( Defined *parentSym, uint32_t linkAtOffset, const ClassExtensionInfo &extInfo, const PointerListInfo &ptrList) { if (ptrList.allPtrs.empty()) - return; + return nullptr; assert(ptrList.allPtrs.size() == ptrList.structCount); @@ -820,6 +868,8 @@ void ObjcCategoryMerger::emitAndLinkProtocolList( infoCategoryWriter.catPtrListInfo.relocTemplate); offset += target->wordSize; } + + return ptrListSym; } // Generate a pointer list (including header) and link it into the parent at the @@ -1265,10 +1315,15 @@ void ObjcCategoryMerger::removeRefsToErasedIsecs() { void ObjcCategoryMerger::doMerge() { collectAndValidateCategoriesData(); - for (auto &entry : categoryMap) - if (entry.second.size() > 1) + for (auto &[baseClass, catInfos] : categoryMap) { + if (auto *baseClassDef = dyn_cast(baseClass)) { + // Merge all categories into the base class + mergeCategoriesIntoBaseClass(baseClassDef, catInfos); + } else if (catInfos.size() > 1) { // Merge all categories into a new, single category - mergeCategoriesIntoSingleCategory(entry.second); + mergeCategoriesIntoSingleCategory(catInfos); + } + } // Erase all categories that were merged eraseMergedCategories(); @@ -1302,3 +1357,101 @@ void objc::mergeCategories() { } void objc::doCleanup() { ObjcCategoryMerger::doCleanup(); } + +void ObjcCategoryMerger::mergeCategoriesIntoBaseClass( + const Defined *baseClass, std::vector &categories) { + assert(categories.size() >= 1 && "Expected at least one category to merge"); + + // Collect all the info from the categories + ClassExtensionInfo extInfo(catLayout); + for (auto &catInfo : categories) { + parseCatInfoToExtInfo(catInfo, extInfo); + } + + // Get metadata for the base class + Defined *metaRo = getClassRo(baseClass, /*getMetaRo=*/true); + ConcatInputSection *metaIsec = dyn_cast(metaRo->isec()); + Defined *classRo = getClassRo(baseClass, /*getMetaRo=*/false); + ConcatInputSection 
*classIsec = dyn_cast(classRo->isec()); + + // Now collect the info from the base class from the various lists in the + // class metadata + + // Protocol lists are a special case - the same protocol list is in classRo + // and metaRo, so we only need to parse it once + parseProtocolListInfo(classIsec, roClassLayout.baseProtocolsOffset, + extInfo.protocols); + + // Check that the classRo and metaRo protocol lists are identical + assert( + parseProtocolListInfo(classIsec, roClassLayout.baseProtocolsOffset) == + parseProtocolListInfo(metaIsec, roClassLayout.baseProtocolsOffset) && + "Category merger expects classRo and metaRo to have the same protocol " + "list"); + + parsePointerListInfo(metaIsec, roClassLayout.baseMethodsOffset, + extInfo.classMethods); + parsePointerListInfo(classIsec, roClassLayout.baseMethodsOffset, + extInfo.instanceMethods); + + parsePointerListInfo(metaIsec, roClassLayout.basePropertiesOffset, + extInfo.classProps); + parsePointerListInfo(classIsec, roClassLayout.basePropertiesOffset, + extInfo.instanceProps); + + // Erase the old lists - these will be generated and replaced + eraseSymbolAtIsecOffset(metaIsec, roClassLayout.baseMethodsOffset); + eraseSymbolAtIsecOffset(metaIsec, roClassLayout.baseProtocolsOffset); + eraseSymbolAtIsecOffset(metaIsec, roClassLayout.basePropertiesOffset); + eraseSymbolAtIsecOffset(classIsec, roClassLayout.baseMethodsOffset); + eraseSymbolAtIsecOffset(classIsec, roClassLayout.baseProtocolsOffset); + eraseSymbolAtIsecOffset(classIsec, roClassLayout.basePropertiesOffset); + + // Emit the newly merged lists - first into the meta RO then into the class RO + // First we emit and link the protocol list into the meta RO. 
Then we link it + // in the classRo as well (they're supposed to be identical) + if (Defined *protoListSym = + emitAndLinkProtocolList(metaRo, roClassLayout.baseProtocolsOffset, + extInfo, extInfo.protocols)) { + createSymbolReference(classRo, protoListSym, + roClassLayout.baseProtocolsOffset, + infoCategoryWriter.catBodyInfo.relocTemplate); + } + + emitAndLinkPointerList(metaRo, roClassLayout.baseMethodsOffset, extInfo, + extInfo.classMethods); + emitAndLinkPointerList(classRo, roClassLayout.baseMethodsOffset, extInfo, + extInfo.instanceMethods); + + emitAndLinkPointerList(metaRo, roClassLayout.basePropertiesOffset, extInfo, + extInfo.classProps); + + emitAndLinkPointerList(classRo, roClassLayout.basePropertiesOffset, extInfo, + extInfo.instanceProps); + + // Mark all the categories as merged - this will be used to erase them later + for (auto &catInfo : categories) + catInfo.wasMerged = true; +} + +// Erase the symbol at a given offset in an InputSection +void ObjcCategoryMerger::eraseSymbolAtIsecOffset(ConcatInputSection *isec, + uint32_t offset) { + Defined *sym = tryGetDefinedAtIsecOffset(isec, offset); + if (!sym) + return; + + // Remove the symbol from isec->symbols + assert(isa(sym) && "Can only erase a Defined"); + llvm::erase(isec->symbols, sym); + + // Remove the relocs that refer to this symbol + auto removeAtOff = [offset](Reloc const &r) { return r.offset == offset; }; + llvm::erase_if(isec->relocs, removeAtOff); + + // Now, if the symbol fully occupies a ConcatInputSection, we can also erase + // the whole ConcatInputSection + if (ConcatInputSection *cisec = dyn_cast(sym->isec())) + if (cisec->data.size() == sym->size) + eraseISec(cisec); +} diff --git a/lld/test/MachO/objc-category-merging-complete-test.s b/lld/test/MachO/objc-category-merging-complete-test.s index 74400177b550dc..cf3e19e2f9c8b4 100644 --- a/lld/test/MachO/objc-category-merging-complete-test.s +++ b/lld/test/MachO/objc-category-merging-complete-test.s @@ -1,6 +1,7 @@ # REQUIRES: 
aarch64 # RUN: rm -rf %t; split-file %s %t && cd %t +############ Test merging multiple categories into a single category ############ ## Create a dylib to link against(a64_file1.dylib) and merge categories in the main binary (file2_merge_a64.exe) # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_file1.o a64_file1.s # RUN: %lld -arch arm64 a64_file1.o -o a64_file1.dylib -dylib @@ -12,6 +13,10 @@ # RUN: llvm-objdump --objc-meta-data --macho a64_file2_no_merge.exe | FileCheck %s --check-prefixes=NO_MERGE_CATS # RUN: llvm-objdump --objc-meta-data --macho a64_file2_merge.exe | FileCheck %s --check-prefixes=MERGE_CATS +############ Test merging multiple categories into the base class ############ +# RUN: %lld -arch arm64 -o a64_file2_merge_into_class.exe -objc_category_merging a64_file1.o a64_file2.o +# RUN: llvm-objdump --objc-meta-data --macho a64_file2_merge_into_class.exe | FileCheck %s --check-prefixes=MERGE_CATS_CLS + MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass(Category02|Category03) MERGE_CATS-NEXT: name {{.*}} Category02|Category03 @@ -101,6 +106,211 @@ NO_MERGE_CATS-NEXT: 24 NO_MERGE_CATS-NEXT: 2 +MERGE_CATS_CLS: _OBJC_CLASS_$_MyBaseClass +MERGE_CATS_CLS-NEXT: isa {{.*}} _OBJC_METACLASS_$_MyBaseClass +MERGE_CATS_CLS-NEXT: superclass 0x0 +MERGE_CATS_CLS-NEXT: cache {{.*}} __objc_empty_cache +MERGE_CATS_CLS-NEXT: vtable 0x0 +MERGE_CATS_CLS-NEXT: data {{.*}} (struct class_ro_t *) +MERGE_CATS_CLS-NEXT: flags 0x2 RO_ROOT +MERGE_CATS_CLS-NEXT: instanceStart 0 +MERGE_CATS_CLS-NEXT: instanceSize 4 +MERGE_CATS_CLS-NEXT: reserved 0x0 +MERGE_CATS_CLS-NEXT: ivarLayout 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyBaseClass +MERGE_CATS_CLS-NEXT: baseMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 8 +MERGE_CATS_CLS-NEXT: name {{.*}} class02InstanceMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass(Category02) class02InstanceMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} 
myProtocol02Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass(Category02) myProtocol02Method] +MERGE_CATS_CLS-NEXT: name {{.*}} class03InstanceMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass(Category03) class03InstanceMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol03Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass(Category03) myProtocol03Method] +MERGE_CATS_CLS-NEXT: name {{.*}} baseInstanceMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass baseInstanceMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol01Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass myProtocol01Method] +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass MyProtocol01Prop] +MERGE_CATS_CLS-NEXT: name {{.*}} setMyProtocol01Prop: +MERGE_CATS_CLS-NEXT: types {{.*}} v20@0:8i16 +MERGE_CATS_CLS-NEXT: imp -[MyBaseClass setMyProtocol01Prop:] +MERGE_CATS_CLS-NEXT: baseProtocols {{.*}} +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: list[0] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 2 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol02Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: list[1] {{.*}} (struct protocol_t *) 
+MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 2 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol03Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: list[2] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol01Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} setMyProtocol01Prop: +MERGE_CATS_CLS-NEXT: types {{.*}} v20@0:8i16 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: ivars {{.*}} +MERGE_CATS_CLS-NEXT: entsize 32 +MERGE_CATS_CLS-NEXT: count 1 +MERGE_CATS_CLS-NEXT: offset {{.*}} 0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: type {{.*}} i +MERGE_CATS_CLS-NEXT: alignment 2 +MERGE_CATS_CLS-NEXT: size 4 +MERGE_CATS_CLS-NEXT: weakIvarLayout 0x0 +MERGE_CATS_CLS-NEXT: baseProperties {{.*}} +MERGE_CATS_CLS-NEXT: 
entsize 16 +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02Prop +MERGE_CATS_CLS-NEXT: attributes {{.*}} Ti,R,D +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03Prop +MERGE_CATS_CLS-NEXT: attributes {{.*}} Ti,R,D +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: attributes {{.*}} Ti,N,VMyProtocol01Prop +MERGE_CATS_CLS-NEXT: Meta Class +MERGE_CATS_CLS-NEXT: isa {{.*}} _OBJC_METACLASS_$_MyBaseClass +MERGE_CATS_CLS-NEXT: superclass {{.*}} _OBJC_CLASS_$_MyBaseClass +MERGE_CATS_CLS-NEXT: cache {{.*}} __objc_empty_cache +MERGE_CATS_CLS-NEXT: vtable 0x0 +MERGE_CATS_CLS-NEXT: data {{.*}} (struct class_ro_t *) +MERGE_CATS_CLS-NEXT: flags 0x3 RO_META RO_ROOT +MERGE_CATS_CLS-NEXT: instanceStart 40 +MERGE_CATS_CLS-NEXT: instanceSize 40 +MERGE_CATS_CLS-NEXT: reserved 0x0 +MERGE_CATS_CLS-NEXT: ivarLayout 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyBaseClass +MERGE_CATS_CLS-NEXT: baseMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 5 +MERGE_CATS_CLS-NEXT: name {{.*}} class02ClassMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass(Category02) class02ClassMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass(Category02) MyProtocol02Prop] +MERGE_CATS_CLS-NEXT: name {{.*}} class03ClassMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass(Category03) class03ClassMethod] +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass(Category03) MyProtocol03Prop] +MERGE_CATS_CLS-NEXT: name {{.*}} baseClassMethod +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp +[MyBaseClass baseClassMethod] +MERGE_CATS_CLS-NEXT: baseProtocols {{.*}} +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: list[0] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 
0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 2 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol02Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol02Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: list[1] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 2 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol03Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol03Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: list[2] {{.*}} (struct protocol_t *) +MERGE_CATS_CLS-NEXT: isa 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01 +MERGE_CATS_CLS-NEXT: protocols 0x0 +MERGE_CATS_CLS-NEXT: instanceMethods {{.*}} (struct method_list_t *) +MERGE_CATS_CLS-NEXT: entsize 24 +MERGE_CATS_CLS-NEXT: count 3 +MERGE_CATS_CLS-NEXT: name {{.*}} myProtocol01Method +MERGE_CATS_CLS-NEXT: types {{.*}} v16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: name {{.*}} MyProtocol01Prop +MERGE_CATS_CLS-NEXT: types {{.*}} i16@0:8 +MERGE_CATS_CLS-NEXT: imp 0x0 
+MERGE_CATS_CLS-NEXT: name {{.*}} setMyProtocol01Prop: +MERGE_CATS_CLS-NEXT: types {{.*}} v20@0:8i16 +MERGE_CATS_CLS-NEXT: imp 0x0 +MERGE_CATS_CLS-NEXT: classMethods 0x0 (struct method_list_t *) +MERGE_CATS_CLS-NEXT: optionalInstanceMethods 0x0 +MERGE_CATS_CLS-NEXT: optionalClassMethods 0x0 +MERGE_CATS_CLS-NEXT: instanceProperties {{.*}} +MERGE_CATS_CLS-NEXT: ivars 0x0 +MERGE_CATS_CLS-NEXT: weakIvarLayout 0x0 +MERGE_CATS_CLS-NEXT: baseProperties 0x0 +MERGE_CATS_CLS: __OBJC_$_CATEGORY_MyBaseClass_$_Category04 + + #--- a64_file1.s ## @protocol MyProtocol01 diff --git a/lld/test/MachO/objc-category-merging-extern-class-minimal.s b/lld/test/MachO/objc-category-merging-minimal.s similarity index 59% rename from lld/test/MachO/objc-category-merging-extern-class-minimal.s rename to lld/test/MachO/objc-category-merging-minimal.s index 5dd8924df5ad68..fcd90f178b150e 100644 --- a/lld/test/MachO/objc-category-merging-extern-class-minimal.s +++ b/lld/test/MachO/objc-category-merging-minimal.s @@ -1,7 +1,8 @@ # REQUIRES: aarch64 # RUN: rm -rf %t; split-file %s %t && cd %t -## Create a dylib with a fake base class to link against +############ Test merging multiple categories into a single category ############ +## Create a dylib with a fake base class to link against in when merging between categories # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_fakedylib.o a64_fakedylib.s # RUN: %lld -arch arm64 a64_fakedylib.o -o a64_fakedylib.dylib -dylib @@ -14,6 +15,15 @@ # RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_no_merge.dylib | FileCheck %s --check-prefixes=NO_MERGE_CATS # RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_merge.dylib | FileCheck %s --check-prefixes=MERGE_CATS +############ Test merging multiple categories into the base class ############ +# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_base_class_minimal.o merge_base_class_minimal.s +# RUN: %lld -arch arm64 -dylib -o merge_base_class_minimal_yes_merge.dylib 
-objc_category_merging merge_base_class_minimal.o merge_cat_minimal.o +# RUN: %lld -arch arm64 -dylib -o merge_base_class_minimal_no_merge.dylib merge_base_class_minimal.o merge_cat_minimal.o + +# RUN: llvm-objdump --objc-meta-data --macho merge_base_class_minimal_no_merge.dylib | FileCheck %s --check-prefixes=NO_MERGE_INTO_BASE +# RUN: llvm-objdump --objc-meta-data --macho merge_base_class_minimal_yes_merge.dylib | FileCheck %s --check-prefixes=YES_MERGE_INTO_BASE + + #### Check merge categories enabled ### # Check that the original categories are not there MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01 @@ -44,6 +54,28 @@ NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category01 NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category02 +#### Check merge cateogires into base class is disabled #### +NO_MERGE_INTO_BASE: __OBJC_$_CATEGORY_MyBaseClass_$_Category01 +NO_MERGE_INTO_BASE: __OBJC_$_CATEGORY_MyBaseClass_$_Category02 + +#### Check merge cateogires into base class is enabled and categories are merged into base class #### +YES_MERGE_INTO_BASE-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01 +YES_MERGE_INTO_BASE-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category02 + +YES_MERGE_INTO_BASE: _OBJC_CLASS_$_MyBaseClass +YES_MERGE_INTO_BASE-NEXT: _OBJC_METACLASS_$_MyBaseClass +YES_MERGE_INTO_BASE: baseMethods +YES_MERGE_INTO_BASE-NEXT: entsize 24 +YES_MERGE_INTO_BASE-NEXT: count 3 +YES_MERGE_INTO_BASE-NEXT: name {{.*}} cat01_InstanceMethod +YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8 +YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass(Category01) cat01_InstanceMethod] +YES_MERGE_INTO_BASE-NEXT: name {{.*}} cat02_InstanceMethod +YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8 +YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass(Category02) cat02_InstanceMethod] +YES_MERGE_INTO_BASE-NEXT: name {{.*}} baseInstanceMethod +YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8 +YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass baseInstanceMethod] #--- a64_fakedylib.s @@ -156,3 +188,94 @@ 
L_OBJC_IMAGE_INFO: .addrsig .addrsig_sym __OBJC_$_CATEGORY_MyBaseClass_$_Category01 + +#--- merge_base_class_minimal.s +; clang -c merge_base_class_minimal.mm -O3 -target arm64-apple-macos -arch arm64 -S -o merge_base_class_minimal.s +; ================== Generated from ObjC: ================== +; __attribute__((objc_root_class)) +; @interface MyBaseClass +; - (void)baseInstanceMethod; +; @end +; +; @implementation MyBaseClass +; - (void)baseInstanceMethod {} +; @end +; ================== Generated from ObjC ================== + .section __TEXT,__text,regular,pure_instructions + .build_version macos, 11, 0 + .p2align 2 +"-[MyBaseClass baseInstanceMethod]": + .cfi_startproc +; %bb.0: + ret + .cfi_endproc + .section __DATA,__objc_data + .globl _OBJC_CLASS_$_MyBaseClass + .p2align 3, 0x0 +_OBJC_CLASS_$_MyBaseClass: + .quad _OBJC_METACLASS_$_MyBaseClass + .quad 0 + .quad 0 + .quad 0 + .quad __OBJC_CLASS_RO_$_MyBaseClass + .globl _OBJC_METACLASS_$_MyBaseClass + .p2align 3, 0x0 +_OBJC_METACLASS_$_MyBaseClass: + .quad _OBJC_METACLASS_$_MyBaseClass + .quad _OBJC_CLASS_$_MyBaseClass + .quad 0 + .quad 0 + .quad __OBJC_METACLASS_RO_$_MyBaseClass + .section __TEXT,__objc_classname,cstring_literals +l_OBJC_CLASS_NAME_: + .asciz "MyBaseClass" + .section __DATA,__objc_const + .p2align 3, 0x0 +__OBJC_METACLASS_RO_$_MyBaseClass: + .long 3 + .long 40 + .long 40 + .space 4 + .quad 0 + .quad l_OBJC_CLASS_NAME_ + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .section __TEXT,__objc_methname,cstring_literals +l_OBJC_METH_VAR_NAME_: + .asciz "baseInstanceMethod" + .section __TEXT,__objc_methtype,cstring_literals +l_OBJC_METH_VAR_TYPE_: + .asciz "v16@0:8" + .section __DATA,__objc_const + .p2align 3, 0x0 +__OBJC_$_INSTANCE_METHODS_MyBaseClass: + .long 24 + .long 1 + .quad l_OBJC_METH_VAR_NAME_ + .quad l_OBJC_METH_VAR_TYPE_ + .quad "-[MyBaseClass baseInstanceMethod]" + .p2align 3, 0x0 +__OBJC_CLASS_RO_$_MyBaseClass: + .long 2 + .long 0 + .long 0 + .space 4 + .quad 0 + .quad 
l_OBJC_CLASS_NAME_ + .quad __OBJC_$_INSTANCE_METHODS_MyBaseClass + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .section __DATA,__objc_classlist,regular,no_dead_strip + .p2align 3, 0x0 +l_OBJC_LABEL_CLASS_$: + .quad _OBJC_CLASS_$_MyBaseClass + .section __DATA,__objc_imageinfo,regular,no_dead_strip +L_OBJC_IMAGE_INFO: + .long 0 + .long 64 +.subsections_via_symbols From d1d863c012cf3d5b407ae06d23a5628ec9510b7c Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 28 May 2024 10:32:09 -0700 Subject: [PATCH 37/89] [lldb] Remove lldbassert in AppleObjCTypeEncodingParser (#93332) AppleObjCTypeEncodingParser::BuildObjCObjectPointerType currently contains an lldbassert to detect situations where we have a forward declaration without a definition. According to the accompanying comment, its purpose is to catch "weird cases" during test suite runs. However, because this is an lldbassert, we show a scary message to our users who think this is a problem and report the issue to us. Unfortunately those reports aren't very actionable without a way to know the name of the type. This patch changes the lldbassert to a regular assert and emits a log message to the types log when this happens. 
rdar://127439898 --- .../AppleObjCTypeEncodingParser.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp index ca582cb1d5a46f..4871c59faefccc 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp @@ -13,6 +13,8 @@ #include "lldb/Symbol/CompilerType.h" #include "lldb/Target/Process.h" #include "lldb/Target/Target.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" #include "lldb/Utility/StringLexer.h" #include "clang/Basic/TargetInfo.h" @@ -234,12 +236,15 @@ clang::QualType AppleObjCTypeEncodingParser::BuildObjCObjectPointerType( auto types = decl_vendor->FindTypes(ConstString(name), /*max_matches*/ 1); - // The user can forward-declare something that has no definition. The runtime - // doesn't prohibit this at all. This is a rare and very weird case. We keep - // this assert in debug builds so we catch other weird cases. - lldbassert(!types.empty()); - if (types.empty()) + if (types.empty()) { + // The user can forward-declare something that has no definition. The + // runtime doesn't prohibit this at all. This is a rare and very weird + // case. Assert assert in debug builds so we catch other weird cases. 
+ assert(false && "forward declaration without definition"); + LLDB_LOG(GetLog(LLDBLog::Types), + "forward declaration without definition: {0}", name) return ast_ctx.getObjCIdType(); + } return ClangUtil::GetQualType(types.front().GetPointerType()); } else { From f69b6d2c99a10847a2d73c7fcd656d2ae22937ce Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 28 May 2024 10:36:20 -0700 Subject: [PATCH 38/89] [lldb] Add missing semicolon (NFC) --- .../ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp index 4871c59faefccc..ddaa7a8a597b4f 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp @@ -242,7 +242,7 @@ clang::QualType AppleObjCTypeEncodingParser::BuildObjCObjectPointerType( // case. Assert assert in debug builds so we catch other weird cases. assert(false && "forward declaration without definition"); LLDB_LOG(GetLog(LLDBLog::Types), - "forward declaration without definition: {0}", name) + "forward declaration without definition: {0}", name); return ast_ctx.getObjCIdType(); } From c09787b7d05083791b417c5b97a8cfd6d0874ed9 Mon Sep 17 00:00:00 2001 From: Franklin Zhang Date: Wed, 29 May 2024 01:39:35 +0800 Subject: [PATCH 39/89] [OMPT] Set default values for tsan function pointers (#93568) Avoid calling NULL function pointers in cases where ompt_start_tool succeeds but those tsan functions do not really exist. 
Fix https://github.com/llvm/llvm-project/issues/93524 --------- Co-authored-by: Joachim --- openmp/tools/archer/ompt-tsan.cpp | 33 +++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp index de77e25db2d399..d7658077e83ae0 100644 --- a/openmp/tools/archer/ompt-tsan.cpp +++ b/openmp/tools/archer/ompt-tsan.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,6 @@ #include #include #include -#include #include "omp-tools.h" @@ -146,18 +146,28 @@ void __attribute__((weak)) __tsan_flush_memory() {} static ArcherFlags *archer_flags; #ifndef TsanHappensBefore + +template static void __ompt_tsan_func(Args...) {} + +#define DECLARE_TSAN_FUNCTION(name, ...) \ + static void (*name)(__VA_ARGS__) = __ompt_tsan_func<__VA_ARGS__>; + // Thread Sanitizer is a tool that finds races in code. // See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations . // tsan detects these exact functions by name. 
extern "C" { -static void (*AnnotateHappensAfter)(const char *, int, const volatile void *); -static void (*AnnotateHappensBefore)(const char *, int, const volatile void *); -static void (*AnnotateIgnoreWritesBegin)(const char *, int); -static void (*AnnotateIgnoreWritesEnd)(const char *, int); -static void (*AnnotateNewMemory)(const char *, int, const volatile void *, - size_t); -static void (*__tsan_func_entry)(const void *); -static void (*__tsan_func_exit)(void); +DECLARE_TSAN_FUNCTION(AnnotateHappensAfter, const char *, int, + const volatile void *) +DECLARE_TSAN_FUNCTION(AnnotateHappensBefore, const char *, int, + const volatile void *) +DECLARE_TSAN_FUNCTION(AnnotateIgnoreWritesBegin, const char *, int) +DECLARE_TSAN_FUNCTION(AnnotateIgnoreWritesEnd, const char *, int) +DECLARE_TSAN_FUNCTION(AnnotateNewMemory, const char *, int, + const volatile void *, size_t) +DECLARE_TSAN_FUNCTION(__tsan_func_entry, const void *) +DECLARE_TSAN_FUNCTION(__tsan_func_exit) + +// RunningOnValgrind is used to detect absence of TSan and must intentionally be a nullptr. static int (*RunningOnValgrind)(void); } @@ -1142,7 +1152,10 @@ static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id, #define findTsanFunction(f, fSig) \ do { \ - if (NULL == (f = fSig dlsym(RTLD_DEFAULT, #f))) \ + void *fp = dlsym(RTLD_DEFAULT, #f); \ + if (fp) \ + f = fSig fp; \ + else \ printf("Unable to find TSan function " #f ".\n"); \ } while (0) From ef67f31e88dbae46811f03da945cfb8130c6fa15 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 10:51:21 -0700 Subject: [PATCH 40/89] [SCEV] Compute symbolic max backedge taken count in BTI directly. (NFC) Move symbolic max backedge taken count computation to BackedgeTakenInfo, use existing ExitNotTaken info. In preparation for https://github.com/llvm/llvm-project/pull/93498. 
--- llvm/include/llvm/Analysis/ScalarEvolution.h | 5 -- llvm/lib/Analysis/ScalarEvolution.cpp | 48 +++++++++----------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 5828cc156cc785..1d016b28347d27 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1761,11 +1761,6 @@ class ScalarEvolution { ExitLimit computeExitLimit(const Loop *L, BasicBlock *ExitingBlock, bool AllowPredicates = false); - /// Return a symbolic upper bound for the backedge taken count of the loop. - /// This is more general than getConstantMaxBackedgeTakenCount as it returns - /// an arbitrary expression as opposed to only constants. - const SCEV *computeSymbolicMaxBackedgeTakenCount(const Loop *L); - // Helper functions for computeExitLimitFromCond to avoid exponential time // complexity. diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 8d971e6a78e420..bb56b41fe15d58 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8647,8 +8647,28 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const { const SCEV * ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L, ScalarEvolution *SE) { - if (!SymbolicMax) - SymbolicMax = SE->computeSymbolicMaxBackedgeTakenCount(L); + if (!SymbolicMax) { + // Form an expression for the maximum exit count possible for this loop. We + // merge the max and exact information to approximate a version of + // getConstantMaxBackedgeTakenCount which isn't restricted to just + // constants. 
+ SmallVector ExitCounts; + + for (const auto &ENT : ExitNotTaken) { + const SCEV *ExitCount = ENT.SymbolicMaxNotTaken; + if (!isa(ExitCount)) { + assert(SE->DT.dominates(ENT.ExitingBlock, L->getLoopLatch()) && + "We should only have known counts for exiting blocks that " + "dominate latch!"); + ExitCounts.push_back(ExitCount); + } + } + if (ExitCounts.empty()) + SymbolicMax = SE->getCouldNotCompute(); + else + SymbolicMax = + SE->getUMinFromMismatchedTypes(ExitCounts, /*Sequential*/ true); + } return SymbolicMax; } @@ -14964,30 +14984,6 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS, return false; } -const SCEV * -ScalarEvolution::computeSymbolicMaxBackedgeTakenCount(const Loop *L) { - SmallVector ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - - // Form an expression for the maximum exit count possible for this loop. We - // merge the max and exact information to approximate a version of - // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. - SmallVector ExitCounts; - for (BasicBlock *ExitingBB : ExitingBlocks) { - const SCEV *ExitCount = - getExitCount(L, ExitingBB, ScalarEvolution::SymbolicMaximum); - if (!isa(ExitCount)) { - assert(DT.dominates(ExitingBB, L->getLoopLatch()) && - "We should only have known counts for exiting blocks that " - "dominate latch!"); - ExitCounts.push_back(ExitCount); - } - } - if (ExitCounts.empty()) - return getCouldNotCompute(); - return getUMinFromMismatchedTypes(ExitCounts, /*Sequential*/ true); -} - /// A rewriter to replace SCEV expressions in Map with the corresponding entry /// in the map. It skips AddRecExpr because we cannot guarantee that the /// replacement is loop invariant in the loop of the AddRec. 
From 0b2094c4553a63bb058c59073fc7c22d05e66977 Mon Sep 17 00:00:00 2001 From: Sirraide Date: Tue, 28 May 2024 19:56:04 +0200 Subject: [PATCH 41/89] [Clang] [NFC] Remove debug printing --- clang/lib/Sema/SemaStmtAttr.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 82373fe96a8243..6f538ed55cb72e 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -684,10 +684,8 @@ ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A, } if (!getLangOpts().CPlusPlus23 && - A.getSyntax() == AttributeCommonInfo::AS_CXX11) { - llvm::dbgs() << "Syntax: " << int(A.getSyntax()) << "\n"; + A.getSyntax() == AttributeCommonInfo::AS_CXX11) Diag(A.getLoc(), diag::ext_cxx23_attr) << A << Range; - } return Assumption; } From f0899964e4041b1dc70dc66450a7f6b3e3a22262 Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Tue, 28 May 2024 13:59:49 -0400 Subject: [PATCH 42/89] [clang][Sema] Don't emit 'declared here' note for builtin functions with no decl in source (#93394) Fixes #93369 --------- Co-authored-by: Timm Baeder Co-authored-by: S. B. Tam --- clang/docs/ReleaseNotes.rst | 3 +++ clang/lib/Sema/SemaLookup.cpp | 10 ++++++++++ clang/test/SemaCXX/invalid-if-constexpr.cpp | 3 +-- clang/test/SemaCXX/typo-correction-builtin-func.cpp | 8 ++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 clang/test/SemaCXX/typo-correction-builtin-func.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 173e61fbf7b2c1..894f6b04431744 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -541,6 +541,9 @@ Improvements to Clang's diagnostics - Clang emits a ``-Wparentheses`` warning for expressions with consecutive comparisons like ``x < y < z``. Fixes #GH20456. +- Clang no longer emits a "declared here" note for a builtin function that has no declaration in source. + Fixes #GH93369. 
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index ef0a655b631ab4..be6ea20a956a39 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -5897,6 +5897,16 @@ void Sema::diagnoseTypo(const TypoCorrection &Correction, NamedDecl *ChosenDecl = Correction.isKeyword() ? nullptr : Correction.getFoundDecl(); + + // For builtin functions which aren't declared anywhere in source, + // don't emit the "declared here" note. + if (const auto *FD = dyn_cast_if_present(ChosenDecl); + FD && FD->getBuiltinID() && + PrevNote.getDiagID() == diag::note_previous_decl && + Correction.getCorrectionRange().getBegin() == FD->getBeginLoc()) { + ChosenDecl = nullptr; + } + if (PrevNote.getDiagID() && ChosenDecl) Diag(ChosenDecl->getLocation(), PrevNote) << CorrectedQuotedStr << (ErrorRecovery ? FixItHint() : FixTypo); diff --git a/clang/test/SemaCXX/invalid-if-constexpr.cpp b/clang/test/SemaCXX/invalid-if-constexpr.cpp index 7643c47488f057..0007f2739cbbd0 100644 --- a/clang/test/SemaCXX/invalid-if-constexpr.cpp +++ b/clang/test/SemaCXX/invalid-if-constexpr.cpp @@ -4,8 +4,7 @@ namespace GH61885 { void similar() { // expected-note {{'similar' declared here}} if constexpr (similer<>) {} // expected-error {{use of undeclared identifier 'similer'; did you mean 'similar'?}} } -void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'; did you mean '__sync_swap'?}} \ - // expected-note {{'__sync_swap' declared here}} +void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'; did you mean '__sync_swap'?}} int AA() { return true;} // expected-note {{'AA' declared here}} diff --git a/clang/test/SemaCXX/typo-correction-builtin-func.cpp b/clang/test/SemaCXX/typo-correction-builtin-func.cpp new file mode 100644 index 00000000000000..8d369034d1be33 --- /dev/null +++ 
b/clang/test/SemaCXX/typo-correction-builtin-func.cpp @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +// Test that clang does not emit 'declared here' note for builtin functions that don't have a declaration in source. + +void t0() { + constexpr float A = __builtin_isinfinity(); // expected-error {{use of undeclared identifier '__builtin_isinfinity'; did you mean '__builtin_isfinite'?}} + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} From 73e22ff3d77db72bb9b6e22342417a5f4fe6afb4 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 28 May 2024 11:05:38 -0700 Subject: [PATCH 43/89] [Reassociate] Preserve NSW flags after expr tree rewriting (#93105) We can guarantee NSW on all operands in a reassociated add expression tree when: - All adds in an add operator tree are NSW, AND either - All add operands are guaranteed to be nonnegative, OR - All adds are also NUW - Alive2: - Nonnegative Operands - 3 operands: https://alive2.llvm.org/ce/z/G4XW6Q - 4 operands: https://alive2.llvm.org/ce/z/FWcZ6D - NUW NSW adds: https://alive2.llvm.org/ce/z/vRUxeC --------- Co-authored-by: Nikita Popov --- .../llvm/Transforms/Scalar/Reassociate.h | 12 ++- llvm/lib/Transforms/Scalar/Reassociate.cpp | 35 +++++--- llvm/test/Transforms/Reassociate/local-cse.ll | 40 +++++----- .../Transforms/Reassociate/reassoc-add-nsw.ll | 79 +++++++++++++++++++ 4 files changed, 132 insertions(+), 34 deletions(-) create mode 100644 llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll diff --git a/llvm/include/llvm/Transforms/Scalar/Reassociate.h b/llvm/include/llvm/Transforms/Scalar/Reassociate.h index f3a2e0f4380eb0..84d72df6fc4d81 100644 --- a/llvm/include/llvm/Transforms/Scalar/Reassociate.h +++ b/llvm/include/llvm/Transforms/Scalar/Reassociate.h @@ -63,6 +63,16 @@ struct Factor { Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} }; +struct OverflowTracking { + bool HasNUW; + bool HasNSW; + bool AllKnownNonNegative; + // Note: 
AllKnownNonNegative can be true in a case where one of the operands + // is negative, but one the operators is not NSW. AllKnownNonNegative should + // not be used independently of HasNSW + OverflowTracking() : HasNUW(true), HasNSW(true), AllKnownNonNegative(true) {} +}; + class XorOpnd; } // end namespace reassociate @@ -103,7 +113,7 @@ class ReassociatePass : public PassInfoMixin { void ReassociateExpression(BinaryOperator *I); void RewriteExprTree(BinaryOperator *I, SmallVectorImpl &Ops, - bool HasNUW); + reassociate::OverflowTracking Flags); Value *OptimizeExpression(BinaryOperator *I, SmallVectorImpl &Ops); Value *OptimizeAdd(Instruction *I, diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index d91320863e241d..c903e47a93cafd 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -471,7 +471,7 @@ using RepeatedValue = std::pair; static bool LinearizeExprTree(Instruction *I, SmallVectorImpl &Ops, ReassociatePass::OrderedSet &ToRedo, - bool &HasNUW) { + reassociate::OverflowTracking &Flags) { assert((isa(I) || isa(I)) && "Expected a UnaryOperator or BinaryOperator!"); LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); @@ -512,6 +512,7 @@ static bool LinearizeExprTree(Instruction *I, using LeafMap = DenseMap; LeafMap Leaves; // Leaf -> Total weight so far. SmallVector LeafOrder; // Ensure deterministic leaf output order. + const DataLayout DL = I->getModule()->getDataLayout(); #ifndef NDEBUG SmallPtrSet Visited; // For checking the iteration scheme. @@ -520,8 +521,10 @@ static bool LinearizeExprTree(Instruction *I, std::pair P = Worklist.pop_back_val(); I = P.first; // We examine the operands of this binary operator. - if (isa(I)) - HasNUW &= I->hasNoUnsignedWrap(); + if (isa(I)) { + Flags.HasNUW &= I->hasNoUnsignedWrap(); + Flags.HasNSW &= I->hasNoSignedWrap(); + } for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands. 
Value *Op = I->getOperand(OpIdx); @@ -648,6 +651,8 @@ static bool LinearizeExprTree(Instruction *I, // Ensure the leaf is only output once. It->second = 0; Ops.push_back(std::make_pair(V, Weight)); + if (Opcode == Instruction::Add && Flags.AllKnownNonNegative && Flags.HasNSW) + Flags.AllKnownNonNegative &= isKnownNonNegative(V, SimplifyQuery(DL)); } // For nilpotent operations or addition there may be no operands, for example @@ -666,7 +671,7 @@ static bool LinearizeExprTree(Instruction *I, /// linearized and optimized, emit them in-order. void ReassociatePass::RewriteExprTree(BinaryOperator *I, SmallVectorImpl &Ops, - bool HasNUW) { + OverflowTracking Flags) { assert(Ops.size() > 1 && "Single values should be used directly!"); // Since our optimizations should never increase the number of operations, the @@ -834,8 +839,12 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, // Note that it doesn't hold for mul if one of the operands is zero. // TODO: We can preserve NUW flag if we prove that all mul operands // are non-zero. 
- if (HasNUW && ExpressionChangedStart->getOpcode() == Instruction::Add) - ExpressionChangedStart->setHasNoUnsignedWrap(); + if (ExpressionChangedStart->getOpcode() == Instruction::Add) { + if (Flags.HasNUW) + ExpressionChangedStart->setHasNoUnsignedWrap(); + if (Flags.HasNSW && (Flags.AllKnownNonNegative || Flags.HasNUW)) + ExpressionChangedStart->setHasNoSignedWrap(); + } } } @@ -1192,8 +1201,8 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { return nullptr; SmallVector Tree; - bool HasNUW = true; - MadeChange |= LinearizeExprTree(BO, Tree, RedoInsts, HasNUW); + OverflowTracking Flags; + MadeChange |= LinearizeExprTree(BO, Tree, RedoInsts, Flags); SmallVector Factors; Factors.reserve(Tree.size()); for (unsigned i = 0, e = Tree.size(); i != e; ++i) { @@ -1235,7 +1244,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { if (!FoundFactor) { // Make sure to restore the operands to the expression tree. - RewriteExprTree(BO, Factors, HasNUW); + RewriteExprTree(BO, Factors, Flags); return nullptr; } @@ -1247,7 +1256,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { RedoInsts.insert(BO); V = Factors[0].Op; } else { - RewriteExprTree(BO, Factors, HasNUW); + RewriteExprTree(BO, Factors, Flags); V = BO; } @@ -2373,8 +2382,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { // First, walk the expression tree, linearizing the tree, collecting the // operand information. SmallVector Tree; - bool HasNUW = true; - MadeChange |= LinearizeExprTree(I, Tree, RedoInsts, HasNUW); + OverflowTracking Flags; + MadeChange |= LinearizeExprTree(I, Tree, RedoInsts, Flags); SmallVector Ops; Ops.reserve(Tree.size()); for (const RepeatedValue &E : Tree) @@ -2567,7 +2576,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { dbgs() << '\n'); // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. 
- RewriteExprTree(I, Ops, HasNUW); + RewriteExprTree(I, Ops, Flags); } void diff --git a/llvm/test/Transforms/Reassociate/local-cse.ll b/llvm/test/Transforms/Reassociate/local-cse.ll index 4d0467e263f553..d0d609f022b46b 100644 --- a/llvm/test/Transforms/Reassociate/local-cse.ll +++ b/llvm/test/Transforms/Reassociate/local-cse.ll @@ -26,16 +26,16 @@ define void @chain_spanning_several_blocks(i64 %inv1, i64 %inv2, i64 %inv3, i64 ; LOCAL_CSE-LABEL: define void @chain_spanning_several_blocks ; LOCAL_CSE-SAME: (i64 [[INV1:%.*]], i64 [[INV2:%.*]], i64 [[INV3:%.*]], i64 [[INV4:%.*]], i64 [[INV5:%.*]]) { ; LOCAL_CSE-NEXT: bb1: -; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw i64 [[INV2]], [[INV1]] +; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw nsw i64 [[INV2]], [[INV1]] ; LOCAL_CSE-NEXT: br label [[BB2:%.*]] ; LOCAL_CSE: bb2: ; LOCAL_CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() -; LOCAL_CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV4]] -; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw i64 [[CHAIN_A1]], [[VAL_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV5]] -; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw i64 [[CHAIN_B1]], [[VAL_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add nuw i64 [[INV3]], [[INV1]] -; LOCAL_CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_C0]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV4]] +; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV5]] +; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_B1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add nuw nsw i64 [[INV3]], [[INV1]] +; LOCAL_CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_C0]], [[VAL_BB2]] ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) @@ -47,11 +47,11 @@ define void 
@chain_spanning_several_blocks(i64 %inv1, i64 %inv2, i64 %inv3, i64 ; CSE-NEXT: br label [[BB2:%.*]] ; CSE: bb2: ; CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() -; CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw i64 [[VAL_BB2]], [[INV1]] -; CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV2]] +; CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw nsw i64 [[VAL_BB2]], [[INV1]] +; CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV2]] ; CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV4]] ; CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV5]] -; CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV3]] +; CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV3]] ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) @@ -90,19 +90,19 @@ define void @chain_spanning_several_blocks_no_entry_anchor() { ; LOCAL_CSE-NEXT: br label [[BB1:%.*]] ; LOCAL_CSE: bb1: ; LOCAL_CSE-NEXT: [[INV1_BB1:%.*]] = call i64 @get_val() -; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw i64 [[INV1_BB1]], [[INV2_BB0]] +; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw nsw i64 [[INV1_BB1]], [[INV2_BB0]] ; LOCAL_CSE-NEXT: br label [[BB2:%.*]] ; LOCAL_CSE: bb2: ; LOCAL_CSE-NEXT: [[INV3_BB2:%.*]] = call i64 @get_val() ; LOCAL_CSE-NEXT: [[INV4_BB2:%.*]] = call i64 @get_val() ; LOCAL_CSE-NEXT: [[INV5_BB2:%.*]] = call i64 @get_val() ; LOCAL_CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() -; LOCAL_CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV4_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw i64 [[CHAIN_A1]], [[VAL_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV5_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw i64 [[CHAIN_B1]], [[VAL_BB2]] -; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add nuw i64 [[VAL_BB2]], [[INV1_BB1]] -; LOCAL_CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_C0]], [[INV3_BB2]] +; LOCAL_CSE-NEXT: 
[[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV4_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV5_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_B1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add nuw nsw i64 [[VAL_BB2]], [[INV1_BB1]] +; LOCAL_CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_C0]], [[INV3_BB2]] ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) ; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) @@ -120,11 +120,11 @@ define void @chain_spanning_several_blocks_no_entry_anchor() { ; CSE-NEXT: [[INV4_BB2:%.*]] = call i64 @get_val() ; CSE-NEXT: [[INV5_BB2:%.*]] = call i64 @get_val() ; CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() -; CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw i64 [[VAL_BB2]], [[INV1_BB1]] -; CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV2_BB0]] +; CSE-NEXT: [[CHAIN_A0:%.*]] = add nuw nsw i64 [[VAL_BB2]], [[INV1_BB1]] +; CSE-NEXT: [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV2_BB0]] ; CSE-NEXT: [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV4_BB2]] ; CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV5_BB2]] -; CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV3_BB2]] +; CSE-NEXT: [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV3_BB2]] ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) ; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) diff --git a/llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll b/llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll new file mode 100644 index 00000000000000..fcebc4980e6d7d --- /dev/null +++ b/llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s 
-passes=reassociate -S | FileCheck %s +define i32 @nsw_preserve_nonnegative(ptr %ptr0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: define i32 @nsw_preserve_nonnegative( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[PTR0]], align 4, !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[PTR1]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[PTR2]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[ADD0:%.*]] = add nsw i32 [[V1]], [[V0]] +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD0]], [[V2]] +; CHECK-NEXT: ret i32 [[ADD1]] +; + %v0 = load i32, ptr %ptr0, !range !1 + %v1 = load i32, ptr %ptr1, !range !1 + %v2 = load i32, ptr %ptr2, !range !1 + %add0 = add nsw i32 %v1, %v2 + %add1 = add nsw i32 %add0, %v0 + ret i32 %add1 +} + +define i32 @nsw_preserve_nuw_nsw(ptr %ptr0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: define i32 @nsw_preserve_nuw_nsw( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[PTR1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[PTR2]], align 4 +; CHECK-NEXT: [[ADD0:%.*]] = add nuw nsw i32 [[V1]], [[V0]] +; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i32 [[ADD0]], [[V2]] +; CHECK-NEXT: ret i32 [[ADD1]] +; + %v0 = load i32, ptr %ptr0 + %v1 = load i32, ptr %ptr1 + %v2 = load i32, ptr %ptr2 + %add0 = add nuw nsw i32 %v1, %v2 + %add1 = add nuw nsw i32 %add0, %v0 + ret i32 %add1 +} + +define i32 @nsw_dont_preserve_negative(ptr %ptr0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: define i32 @nsw_dont_preserve_negative( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[PTR1]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[PTR2]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[ADD0:%.*]] = add i32 [[V1]], 
[[V0]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ADD0]], [[V2]] +; CHECK-NEXT: ret i32 [[ADD1]] +; + %v0 = load i32, ptr %ptr0 + %v1 = load i32, ptr %ptr1, !range !1 + %v2 = load i32, ptr %ptr2, !range !1 + %add0 = add nsw i32 %v1, %v2 + %add1 = add nsw i32 %add0, %v0 + ret i32 %add1 +} + +define i32 @nsw_nopreserve_notallnsw(ptr %ptr0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: define i32 @nsw_nopreserve_notallnsw( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) { +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[PTR0]], align 4, !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[PTR1]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[PTR2]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[ADD0:%.*]] = add i32 [[V1]], [[V0]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ADD0]], [[V2]] +; CHECK-NEXT: ret i32 [[ADD1]] +; + %v0 = load i32, ptr %ptr0, !range !1 + %v1 = load i32, ptr %ptr1, !range !1 + %v2 = load i32, ptr %ptr2, !range !1 + %add0 = add nsw i32 %v1, %v2 + %add1 = add i32 %add0, %v0 + ret i32 %add1 +} + +; Positive 32 bit integers +!1 = !{i32 0, i32 2147483648} +;. +; CHECK: [[RNG0]] = !{i32 0, i32 -2147483648} +;. From 99835922ca2a2ac20769271a49a5f8055bb5dc93 Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Tue, 28 May 2024 11:23:15 -0700 Subject: [PATCH 44/89] [mlir][sparse] remove sparse encoding propagation pass. 
(#93593) --- .../Dialect/SparseTensor/Transforms/Passes.h | 6 ---- .../Dialect/SparseTensor/Transforms/Passes.td | 36 ------------------- .../Transforms/SparseTensorPasses.cpp | 13 ------- 3 files changed, 55 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index bb49d6c256f21b..d6d038ef65bdf4 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -65,12 +65,6 @@ void populateSparseAssembler(RewritePatternSet &patterns, bool directOut); std::unique_ptr createSparseAssembler(); std::unique_ptr createSparseAssembler(bool directOut); -//===----------------------------------------------------------------------===// -// The SparseEncodingPropagation pass. -//===----------------------------------------------------------------------===// - -std::unique_ptr createSparseEncodingPropagationPass(); - //===----------------------------------------------------------------------===// // The SparseReinterpretMap pass. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td index 94c3ca60030eeb..2f844cee5ff528 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td @@ -40,42 +40,6 @@ def SparseAssembler : Pass<"sparse-assembler", "ModuleOp"> { ]; } -def SparseEncodingPropagation : Pass<"sparse-encoding-propagation", "func::FuncOp"> { - let summary = "Propagate sparse tensor encodings"; - let description = [{ - A pass that propagates sparse tensor encodings. - - Background: To avoid introducing repetitive operations, sparse tensors - in MLIR try to reuse tensor operations whenever available. 
However, most - tensor operations are canonicalized/transformed without the knowledge - of sparsity. The pass tries to propagate missing sparse encodings. - - For example: - ```mlir - %s = tensor.extract_slice %input[0, 0,] [2, 1] [1, 1] - : tensor<2x3xf32, #sparse> to tensor<2x1xf32, #sparse> - - // After rank reducing (by tensor dialect transformation) - %t = tensor.extract_slice %input[0, 0,] [2, 1] [1, 1] - : tensor<2x3xf32, #sparse> to tensor<2xf32> - %s = tensor.expand_shape [[0, 1]] %t - : tensor<2xf32> to tensor<2x1xf32, #sparse> - - // After sparsity propagation - %t = tensor.extract_slice %input[0, 0,] [2, 1] [1, 1] - : tensor<2x3xf32, #sparse> to tensor<2xf32, #sparse1> - %s = tensor.expand_shape [[0, 1]] %t - : tensor<2xf32, #sparse1> to tensor<2x1xf32, #sparse> - ``` - }]; - - let constructor = "mlir::createSparseEncodingPropagationPass()"; - let dependentDialects = [ - "sparse_tensor::SparseTensorDialect", - "tensor::TensorDialect", - ]; -} - def SparseReinterpretMap : Pass<"sparse-reinterpret-map", "ModuleOp"> { let summary = "Reinterprets sparse tensor type mappings"; let description = [{ diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp index f57353b5892b5a..b42d58634a36c4 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp @@ -23,7 +23,6 @@ namespace mlir { #define GEN_PASS_DEF_SPARSEASSEMBLER -#define GEN_PASS_DEF_SPARSEENCODINGPROPAGATION #define GEN_PASS_DEF_SPARSEREINTERPRETMAP #define GEN_PASS_DEF_PRESPARSIFICATIONREWRITE #define GEN_PASS_DEF_SPARSIFICATIONPASS @@ -61,14 +60,6 @@ struct SparseAssembler : public impl::SparseAssemblerBase { } }; -struct SparseEncodingPropagation - : public impl::SparseEncodingPropagationBase { - SparseEncodingPropagation() = default; - SparseEncodingPropagation(const SparseEncodingPropagation &pass) = default; - - void 
runOnOperation() override {} -}; - struct SparseReinterpretMap : public impl::SparseReinterpretMapBase { SparseReinterpretMap() = default; @@ -407,10 +398,6 @@ std::unique_ptr mlir::createSparseAssembler() { return std::make_unique(); } -std::unique_ptr mlir::createSparseEncodingPropagationPass() { - return std::make_unique(); -} - std::unique_ptr mlir::createSparseReinterpretMapPass() { return std::make_unique(); } From 196a0809822ba4ac0fc669a46cbacee8afbe36c2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 28 May 2024 20:27:07 +0200 Subject: [PATCH 45/89] DAG: Handle fminnum_ieee/fmaxnum_ieee in basic legalization Handle these in promote float and vector widening. Currently we happen to avoid emitting these unless legal or custom. Avoids regression in a future commit which wants to unconditionally emit these. --- llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 2 ++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index fc96ecdc662808..fb1424f75e097d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2488,6 +2488,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMINIMUM: case ISD::FMAXNUM: case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: case ISD::FMUL: case ISD::FPOW: case ISD::FREM: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 40e621f0db2209..14e8708fd3f38f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4237,8 +4237,12 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SHL: case ISD::VP_SHL: case ISD::SRA: case ISD::VP_SRA: case 
ISD::SRL: case ISD::VP_SRL: - case ISD::FMINNUM: case ISD::VP_FMINNUM: - case ISD::FMAXNUM: case ISD::VP_FMAXNUM: + case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: + case ISD::VP_FMINNUM: + case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: + case ISD::VP_FMAXNUM: case ISD::FMINIMUM: case ISD::VP_FMINIMUM: case ISD::FMAXIMUM: From 08de0b3cf54e4998799673f835e9a7d5ead8efab Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 11:36:48 -0700 Subject: [PATCH 46/89] [WebAssembly] Add tests for EH/SjLj option errors (#93583) This adds tests for EH/SjLj option errors and swaps the error checking order for unimportant cosmetic reasons (I think checking EH/SjLj conflicts is more important than the model checking) --- .../WebAssembly/WebAssemblyTargetMachine.cpp | 34 ++++++++++--------- .../CodeGen/WebAssembly/eh-option-errors.ll | 19 +++++++++++ .../WebAssembly/lower-em-ehsjlj-options.ll | 3 -- 3 files changed, 37 insertions(+), 19 deletions(-) create mode 100644 llvm/test/CodeGen/WebAssembly/eh-option-errors.ll diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index de342e89657367..68126992ddcd72 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -388,15 +388,29 @@ using WebAssembly::WasmEnableEmSjLj; using WebAssembly::WasmEnableSjLj; static void basicCheckForEHAndSjLj(TargetMachine *TM) { - // Before checking, we make sure TargetOptions.ExceptionModel is the same as + + // You can't enable two modes of EH at the same time + if (WasmEnableEmEH && WasmEnableEH) + report_fatal_error( + "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh"); + // You can't enable two modes of SjLj at the same time + if (WasmEnableEmSjLj && WasmEnableSjLj) + report_fatal_error( + "-enable-emscripten-sjlj not allowed with -wasm-enable-sjlj"); + // You can't mix Emscripten EH with Wasm SjLj. 
+ if (WasmEnableEmEH && WasmEnableSjLj) + report_fatal_error( + "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj"); + + // Here we make sure TargetOptions.ExceptionModel is the same as // MCAsmInfo.ExceptionsType. Normally these have to be the same, because clang // stores the exception model info in LangOptions, which is later transferred // to TargetOptions and MCAsmInfo. But when clang compiles bitcode directly, // clang's LangOptions is not used and thus the exception model info is not // correctly transferred to TargetOptions and MCAsmInfo, so we make sure we - // have the correct exception model in WebAssemblyMCAsmInfo constructor. - // But in this case TargetOptions is still not updated, so we make sure they - // are the same. + // have the correct exception model in WebAssemblyMCAsmInfo constructor. But + // in this case TargetOptions is still not updated, so we make sure they are + // the same. TM->Options.ExceptionModel = TM->getMCAsmInfo()->getExceptionHandlingType(); // Basic Correctness checking related to -exception-model @@ -418,18 +432,6 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) { "-exception-model=wasm only allowed with at least one of " "-wasm-enable-eh or -wasm-enable-sjlj"); - // You can't enable two modes of EH at the same time - if (WasmEnableEmEH && WasmEnableEH) - report_fatal_error( - "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh"); - // You can't enable two modes of SjLj at the same time - if (WasmEnableEmSjLj && WasmEnableSjLj) - report_fatal_error( - "-enable-emscripten-sjlj not allowed with -wasm-enable-sjlj"); - // You can't mix Emscripten EH with Wasm SjLj. - if (WasmEnableEmEH && WasmEnableSjLj) - report_fatal_error( - "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj"); // Currently it is allowed to mix Wasm EH with Emscripten SjLj as an interim // measure, but some code will error out at compile time in this combination. 
// See WebAssemblyLowerEmscriptenEHSjLj pass for details. diff --git a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll new file mode 100644 index 00000000000000..74d02ddc405d3f --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll @@ -0,0 +1,19 @@ +target triple = "wasm32-unknown-unknown" + +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-eh 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_EH +; EM_EH_W_WASM_EH: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh + +; RUN: not --crash llc < %s -enable-emscripten-sjlj -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_SJLJ_W_WASM_SJLJ +; EM_SJLJ_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-sjlj not allowed with -wasm-enable-sjlj + +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_SJLJ +; EM_EH_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj + +; RUN: not --crash llc < %s -wasm-enable-eh -exception-model=dwarf 2>&1 | FileCheck %s --check-prefix=EH_MODEL_DWARF +; EH_MODEL_DWARF: LLVM ERROR: -exception-model should be either 'none' or 'wasm' + +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -exception-model=wasm 2>&1 | FileCheck %s --check-prefix=EM_EH_W_MODEL_WASM +; EM_EH_W_MODEL_WASM: LLVM ERROR: -exception-model=wasm not allowed with -enable-emscripten-cxx-exceptions + +; RUN: not --crash llc < %s -exception-model=wasm 2>&1 | FileCheck %s --check-prefix=MODEL_WASM_WO_WASM_EH_SJLJ +; MODEL_WASM_WO_WASM_EH_SJLJ: LLVM ERROR: -exception-model=wasm only allowed with at least one of -wasm-enable-eh or -wasm-enable-sjlj diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll index 4a63c812d6ae9a..66872a54229862 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll +++ 
b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -enable-emscripten-cxx-exceptions | FileCheck %s --check-prefix=EH ; RUN: llc < %s -enable-emscripten-sjlj | FileCheck %s --check-prefix=SJLJ ; RUN: llc < %s | FileCheck %s --check-prefix=NONE -; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -exception-model=wasm 2>&1 | FileCheck %s --check-prefix=WASM-EH-EM-EH target triple = "wasm32-unknown-unknown" @@ -97,5 +96,3 @@ declare void @free(ptr) attributes #0 = { returns_twice } attributes #1 = { noreturn } attributes #2 = { nounwind } - -; WASM-EH-EM-EH: LLVM ERROR: -exception-model=wasm not allowed with -enable-emscripten-cxx-exceptions From d33864d5d8ae55ff1c86510dc475fd9dd72d61c7 Mon Sep 17 00:00:00 2001 From: Karthika Devi C Date: Wed, 29 May 2024 00:11:58 +0530 Subject: [PATCH 47/89] [polly] Fix cppcheck SA comment reported in #91235 (#93505) This patch moves the unreachable assert before return statement. Fixes #91235. --- polly/include/polly/ScheduleTreeTransform.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polly/include/polly/ScheduleTreeTransform.h b/polly/include/polly/ScheduleTreeTransform.h index ee504c4e5f5244..6bd5a3abf9ea28 100644 --- a/polly/include/polly/ScheduleTreeTransform.h +++ b/polly/include/polly/ScheduleTreeTransform.h @@ -47,9 +47,9 @@ struct ScheduleTreeVisitor { return getDerived().visitSequence(Node.as(), std::forward(args)...); case isl_schedule_node_set: + assert(isl_schedule_node_n_children(Node.get()) >= 2); return getDerived().visitSet(Node.as(), std::forward(args)...); - assert(isl_schedule_node_n_children(Node.get()) >= 2); case isl_schedule_node_leaf: assert(isl_schedule_node_n_children(Node.get()) == 0); return getDerived().visitLeaf(Node.as(), From 5901d4005f015a46185ddc080038c1a3db3fa2c7 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 28 May 2024 14:55:18 -0400 Subject: [PATCH 48/89] [C] Disallow declarations where a statement is required 
(#92908) This fixes a regression introduced in 8bd06d5b65845e5e01dd899a2deb773580460b89 where Clang began to accept a declaration where a statement is required. e.g., ``` if (1) int x; // Previously accepted, now properly rejected ``` Fixes #92775 --- clang/docs/ReleaseNotes.rst | 3 ++ clang/include/clang/Parse/Parser.h | 9 +++-- clang/lib/Parse/ParseStmt.cpp | 10 ++++- clang/test/C/C99/block-scopes.c | 3 +- clang/test/Parser/decls.c | 39 +++++++++++++++++++ .../test/SemaOpenACC/parallel-loc-and-stmt.c | 6 ++- 6 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 clang/test/Parser/decls.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 894f6b04431744..9091f6341bd9b8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -632,6 +632,9 @@ Bug Fixes in This Version - ``__is_array`` and ``__is_bounded_array`` no longer return ``true`` for zero-sized arrays. Fixes (#GH54705). +- Correctly reject declarations where a statement is required in C. + Fixes #GH92775 + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 8493026f5f7a69..00b475e5b42824 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -467,15 +467,18 @@ class Parser : public CodeCompletionHandler { /// Flags describing a context in which we're parsing a statement. enum class ParsedStmtContext { + /// This context permits declarations in language modes where declarations + /// are not statements. + AllowDeclarationsInC = 0x1, /// This context permits standalone OpenMP directives. - AllowStandaloneOpenMPDirectives = 0x1, + AllowStandaloneOpenMPDirectives = 0x2, /// This context is at the top level of a GNU statement expression. - InStmtExpr = 0x2, + InStmtExpr = 0x4, /// The context of a regular substatement. SubStmt = 0, /// The context of a compound-statement. 
- Compound = AllowStandaloneOpenMPDirectives, + Compound = AllowDeclarationsInC | AllowStandaloneOpenMPDirectives, LLVM_MARK_AS_BITMASK_ENUM(InStmtExpr) }; diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index b0af04451166ca..c25203243ee49b 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -239,7 +239,15 @@ StmtResult Parser::ParseStatementOrDeclarationAfterAttributes( auto IsStmtAttr = [](ParsedAttr &Attr) { return Attr.isStmtAttr(); }; bool AllAttrsAreStmtAttrs = llvm::all_of(CXX11Attrs, IsStmtAttr) && llvm::all_of(GNUAttrs, IsStmtAttr); - if (((GNUAttributeLoc.isValid() && !(HaveAttrs && AllAttrsAreStmtAttrs)) || + // In C, the grammar production for statement (C23 6.8.1p1) does not allow + // for declarations, which is different from C++ (C++23 [stmt.pre]p1). So + // in C++, we always allow a declaration, but in C we need to check whether + // we're in a statement context that allows declarations. e.g., in C, the + // following is invalid: if (1) int x; + if ((getLangOpts().CPlusPlus || getLangOpts().MicrosoftExt || + (StmtCtx & ParsedStmtContext::AllowDeclarationsInC) != + ParsedStmtContext()) && + ((GNUAttributeLoc.isValid() && !(HaveAttrs && AllAttrsAreStmtAttrs)) || isDeclarationStatement())) { SourceLocation DeclStart = Tok.getLocation(), DeclEnd; DeclGroupPtrTy Decl; diff --git a/clang/test/C/C99/block-scopes.c b/clang/test/C/C99/block-scopes.c index 589047df3e52bc..116e5d922593e0 100644 --- a/clang/test/C/C99/block-scopes.c +++ b/clang/test/C/C99/block-scopes.c @@ -18,8 +18,9 @@ enum {a, b}; void different(void) { - if (sizeof(enum {b, a}) != sizeof(int)) + if (sizeof(enum {b, a}) != sizeof(int)) { _Static_assert(a == 1, ""); + } /* In C89, the 'b' found here would have been from the enum declaration in * the controlling expression of the selection statement, not from the global * declaration. 
In C99 and later, that enumeration is scoped to the 'if' diff --git a/clang/test/Parser/decls.c b/clang/test/Parser/decls.c new file mode 100644 index 00000000000000..39ef05bf4bd999 --- /dev/null +++ b/clang/test/Parser/decls.c @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 %s -fsyntax-only -verify -pedantic + +// Test that we can parse declarations at global scope. +int v; + +void func(void) { + // Test that we can parse declarations within a compound statement. + int a; + { + int b; + } + + int z = ({ // expected-warning {{use of GNU statement expression extension}} + // Test that we can parse declarations within a GNU statement expression. + int w = 12; + w; + }); + + // Test that we diagnose declarations where a statement is required. + // See GH92775. + if (1) + int x; // expected-error {{expected expression}} + for (;;) + int c; // expected-error {{expected expression}} + + label: + int y; // expected-warning {{label followed by a declaration is a C23 extension}} + + // Test that lookup works as expected. + (void)a; + (void)v; + (void)z; + (void)b; // expected-error {{use of undeclared identifier 'b'}} + (void)w; // expected-error {{use of undeclared identifier 'w'}} + (void)x; // expected-error {{use of undeclared identifier 'x'}} + (void)c; // expected-error {{use of undeclared identifier 'c'}} + (void)y; +} + diff --git a/clang/test/SemaOpenACC/parallel-loc-and-stmt.c b/clang/test/SemaOpenACC/parallel-loc-and-stmt.c index ba29f6da8ba25d..bbcdd823483a52 100644 --- a/clang/test/SemaOpenACC/parallel-loc-and-stmt.c +++ b/clang/test/SemaOpenACC/parallel-loc-and-stmt.c @@ -33,9 +33,11 @@ int foo3; void func() { // FIXME: Should we disallow this on declarations, or consider this to be on - // the initialization? + // the initialization? This is currently rejected in C because + // Parser::ParseOpenACCDirectiveStmt() calls ParseStatement() and passes the + // statement context as "SubStmt" which does not allow for a declaration in C. 
#pragma acc parallel - int foo; + int foo; // expected-error {{expected expression}} #pragma acc parallel { From debdbeda15802900615d1bee83e4fc519abeaba6 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Tue, 28 May 2024 20:04:27 +0100 Subject: [PATCH 49/89] [mlir] Remove dialect specific bufferization passes (Reland) (#93535) These passes have been depreciated for a long time and replaced by one-shot bufferization. These passes are also unsafe because they do not check for read-after-write conflicts. Relands https://github.com/llvm/llvm-project/pull/93488 which failed on buildbot. Fixes the failure by updating integration tests to use one-shot-bufferize instead. --- .../mlir/Dialect/Arith/Transforms/Passes.h | 3 - .../mlir/Dialect/Arith/Transforms/Passes.td | 16 ----- .../Dialect/Bufferization/Transforms/Passes.h | 3 - .../Bufferization/Transforms/Passes.td | 5 -- mlir/include/mlir/Dialect/Linalg/Passes.h | 4 -- mlir/include/mlir/Dialect/Linalg/Passes.td | 10 --- .../mlir/Dialect/Shape/Transforms/Passes.h | 7 -- .../mlir/Dialect/Shape/Transforms/Passes.td | 7 -- .../mlir/Dialect/Tensor/Transforms/Passes.h | 3 - .../mlir/Dialect/Tensor/Transforms/Passes.td | 5 -- .../mlir/Dialect/Vector/Transforms/Passes.h | 3 - .../mlir/Dialect/Vector/Transforms/Passes.td | 5 -- .../Dialect/Arith/Transforms/Bufferize.cpp | 67 ------------------- .../Dialect/Arith/Transforms/CMakeLists.txt | 1 - .../Bufferization/Transforms/Bufferize.cpp | 23 ------- .../Dialect/Linalg/Transforms/Bufferize.cpp | 52 -------------- .../Dialect/Linalg/Transforms/CMakeLists.txt | 1 - .../Dialect/Shape/Transforms/Bufferize.cpp | 49 -------------- .../Dialect/Shape/Transforms/CMakeLists.txt | 1 - .../Dialect/Tensor/Transforms/Bufferize.cpp | 58 ---------------- .../Dialect/Tensor/Transforms/CMakeLists.txt | 1 - .../Dialect/Vector/Transforms/Bufferize.cpp | 55 --------------- .../Dialect/Vector/Transforms/CMakeLists.txt | 1 - mlir/test/Dialect/Arith/bufferize.mlir | 8 +-- 
mlir/test/Dialect/Linalg/bufferize.mlir | 30 +-------- mlir/test/Dialect/Shape/bufferize.mlir | 2 +- .../Dialect/SparseTensor/sparse_lower.mlir | 3 +- .../SparseTensor/sparse_lower_col.mlir | 3 +- .../SparseTensor/sparse_lower_inplace.mlir | 3 +- mlir/test/Dialect/Tensor/bufferize.mlir | 2 +- .../Dialect/Vector/bufferize-invalid.mlir | 3 +- mlir/test/Dialect/Vector/bufferize.mlir | 2 +- .../Dialect/Complex/CPU/correctness.mlir | 2 +- .../Linalg/CPU/test-collapse-tensor.mlir | 6 +- .../Dialect/Linalg/CPU/test-elementwise.mlir | 2 +- .../Linalg/CPU/test-expand-tensor.mlir | 6 +- .../Dialect/Linalg/CPU/test-padtensor.mlir | 3 +- .../test-subtensor-insert-multiple-uses.mlir | 4 +- .../Linalg/CPU/test-subtensor-insert.mlir | 4 +- .../Dialect/Linalg/CPU/test-tensor-e2e.mlir | 5 +- .../Linalg/CPU/test-tensor-matmul.mlir | 10 +-- .../Dialect/Memref/print-memref.mlir | 2 +- .../Dialect/Memref/verify-memref.mlir | 2 +- .../Vector/CPU/AMX/test-mulf-full.mlir | 5 +- .../Vector/CPU/AMX/test-muli-full.mlir | 6 +- 45 files changed, 40 insertions(+), 453 deletions(-) delete mode 100644 mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp delete mode 100644 mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp delete mode 100644 mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp delete mode 100644 mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp delete mode 100644 mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h index cbc6147cb81e22..9dc262cc72ed00 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h @@ -24,9 +24,6 @@ namespace arith { class WideIntEmulationConverter; class NarrowTypeEmulationConverter; -/// Create a pass to bufferize arith.constant ops. 
-std::unique_ptr createConstantBufferizePass(uint64_t alignment = 0); - /// Adds patterns to emulate wide Arith and Function ops over integer /// types into supported ones. This is done by splitting original power-of-two /// i2N integer types into two iN halves. diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td index 4096e309199e98..550c5c0cf4f60f 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td @@ -11,22 +11,6 @@ include "mlir/Pass/PassBase.td" -def ArithBufferizePass : Pass<"arith-bufferize", "ModuleOp"> { - let summary = "Bufferize Arith dialect ops."; - let description = [{ - This pass bufferizes arith dialect ops. - - This pass needs to be a module pass because it inserts memref.global - ops into the module, which cannot be done safely from a function pass due to - multi-threading. Most other bufferization passes can run in parallel at - function granularity. - }]; - let options = [ - Option<"alignment", "alignment", "unsigned", /*default=*/"0", - "Create global memrefs with a specified alignment">, - ]; -} - def ArithExpandOpsPass : Pass<"arith-expand"> { let summary = "Legalize Arith ops to be convertible to LLVM."; let dependentDialects = ["vector::VectorDialect"]; diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h index 459c252b707121..e053e6c97e1430 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h @@ -221,9 +221,6 @@ createPromoteBuffersToStackPass(std::function isSmallAlloc); /// insert_slice ops. std::unique_ptr createEmptyTensorEliminationPass(); -/// Create a pass that bufferizes ops from the bufferization dialect. 
-std::unique_ptr createBufferizationBufferizePass(); - //===----------------------------------------------------------------------===// // Registration //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index 75ce85c9128c94..8f8826b9ad56b4 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -350,11 +350,6 @@ def FinalizingBufferize : Pass<"finalizing-bufferize", "func::FuncOp"> { let constructor = "mlir::bufferization::createFinalizingBufferizePass()"; } -def BufferizationBufferize : Pass<"bufferization-bufferize", "func::FuncOp"> { - let summary = "Bufferize the `bufferization` dialect"; - let constructor = "mlir::bufferization::createBufferizationBufferizePass()"; -} - def DropEquivalentBufferResults : Pass<"drop-equivalent-buffer-results", "ModuleOp"> { let summary = "Remove MemRef return values that are equivalent to a bbArg"; let description = [{ diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h index d36d1e70f0b14d..f2955d55e59eca 100644 --- a/mlir/include/mlir/Dialect/Linalg/Passes.h +++ b/mlir/include/mlir/Dialect/Linalg/Passes.h @@ -22,10 +22,6 @@ namespace func { class FuncOp; } // namespace func -namespace bufferization { -struct OneShotBufferizationOptions; -} // namespace bufferization - #define GEN_PASS_DECL #include "mlir/Dialect/Linalg/Passes.h.inc" // IWYU pragma: keep diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td index 0a4ce8953136dd..0621a9f33ba1e8 100644 --- a/mlir/include/mlir/Dialect/Linalg/Passes.td +++ b/mlir/include/mlir/Dialect/Linalg/Passes.td @@ -89,16 +89,6 @@ def LinalgInlineScalarOperandsPass : Pass<"linalg-inline-scalar-operands"> { ]; } -def LinalgBufferizePass : 
Pass<"linalg-bufferize"> { - let summary = "Bufferize the linalg dialect"; - let dependentDialects = [ - "affine::AffineDialect", - "bufferization::BufferizationDialect", - "linalg::LinalgDialect", - "memref::MemRefDialect", - ]; -} - def LinalgGeneralizeNamedOpsPass : Pass<"linalg-generalize-named-ops"> { let summary = "Convert named ops into generic ops"; let dependentDialects = ["linalg::LinalgDialect"]; diff --git a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h index cfb637f133f54c..28e17459ff9625 100644 --- a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h @@ -47,13 +47,6 @@ void populateShapeRewritePatterns(RewritePatternSet &patterns); void populateRemoveShapeConstraintsPatterns(RewritePatternSet &patterns); std::unique_ptr> createRemoveShapeConstraintsPass(); -// Bufferizes shape dialect ops. -// -// Note that most shape dialect ops must be converted to std before -// bufferization happens, as they are intended to be bufferized at the std -// level. -std::unique_ptr> createShapeBufferizePass(); - /// Outline the shape computation part by adding shape.func and populate /// conrresponding mapping infomation into ShapeMappingAnalysis. std::unique_ptr> createOutlineShapeComputationPass(); diff --git a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td index 9dfda9ea336153..83834509b4a35a 100644 --- a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td @@ -103,11 +103,4 @@ def ShapeToShapeLowering : Pass<"shape-to-shape-lowering", "func::FuncOp"> { let constructor = "mlir::createShapeToShapeLowering()"; } -// TODO: Generalize this to allow any type conversions desired. 
-def ShapeBufferize : Pass<"shape-bufferize", "func::FuncOp"> { - let summary = "Bufferize the shape dialect."; - let constructor = "mlir::createShapeBufferizePass()"; - let dependentDialects = ["bufferization::BufferizationDialect", - "memref::MemRefDialect"]; -} #endif // MLIR_DIALECT_SHAPE_TRANSFORMS_PASSES diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h index 48f9066934a25e..964c35b3f15b80 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h @@ -21,9 +21,6 @@ namespace tensor { /// Creates an instance of the `tensor` subset folding pass. std::unique_ptr createFoldTensorSubsetOpsPass(); -/// Creates an instance of the `tensor` dialect bufferization pass. -std::unique_ptr createTensorBufferizePass(); - //===----------------------------------------------------------------------===// // Registration //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td index 4cc3844f29120b..be4c333836ec07 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td @@ -27,9 +27,4 @@ def FoldTensorSubsetOps : Pass<"fold-tensor-subset-ops"> { ]; } -def TensorBufferize : Pass<"tensor-bufferize", "func::FuncOp"> { - let summary = "Bufferize the `tensor` dialect"; - let constructor = "mlir::tensor::createTensorBufferizePass()"; -} - #endif // MLIR_DIALECT_TENSOR_TRANSFORMS_PASSES diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h index 911402551e14d4..5667f4fa95ace4 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h @@ -17,9 +17,6 @@ namespace vector { #define GEN_PASS_DECL #include 
"mlir/Dialect/Vector/Transforms/Passes.h.inc" -/// Creates an instance of the `vector` dialect bufferization pass. -std::unique_ptr createVectorBufferizePass(); - /// Creates an instance of the `vector.mask` lowering pass. std::unique_ptr createLowerVectorMaskPass(); diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td index 31a0b3b2f0c53d..74369987497910 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td @@ -11,11 +11,6 @@ include "mlir/Pass/PassBase.td" -def VectorBufferize : Pass<"vector-bufferize", "func::FuncOp"> { - let summary = "Bufferize Vector dialect ops"; - let constructor = "mlir::vector::createVectorBufferizePass()"; -} - def LowerVectorMaskPass : Pass<"lower-vector-mask", "func::FuncOp"> { let summary = "Lower 'vector.mask' operations"; let constructor = "mlir::vector::createLowerVectorMaskPass()"; diff --git a/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp deleted file mode 100644 index 9a066756f429ca..00000000000000 --- a/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp +++ /dev/null @@ -1,67 +0,0 @@ -//===- Bufferize.cpp - Bufferization for Arith ops ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Arith/Transforms/Passes.h" - -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" - -namespace mlir { -namespace arith { -#define GEN_PASS_DEF_ARITHBUFFERIZEPASS -#include "mlir/Dialect/Arith/Transforms/Passes.h.inc" -} // namespace arith -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -/// Pass to bufferize Arith ops. -struct ArithBufferizePass - : public arith::impl::ArithBufferizePassBase { - using ArithBufferizePassBase::ArithBufferizePassBase; - - ArithBufferizePass(uint64_t alignment = 0, bool constantOpOnly = false) - : constantOpOnly(constantOpOnly) { - this->alignment = alignment; - } - - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - if (constantOpOnly) { - options.opFilter.allowOperation(); - } else { - options.opFilter.allowDialect(); - } - options.bufferAlignment = alignment; - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - arith::registerBufferizableOpInterfaceExternalModels(registry); - } - -private: - bool constantOpOnly; -}; -} // namespace - -std::unique_ptr -mlir::arith::createConstantBufferizePass(uint64_t alignment) { - return std::make_unique(alignment, - /*constantOpOnly=*/true); -} diff --git a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt index 12659eaba1fa5e..6b8bde8dc2aaf3 100644 --- 
a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt @@ -1,7 +1,6 @@ add_mlir_dialect_library(MLIRArithTransforms BufferDeallocationOpInterfaceImpl.cpp BufferizableOpInterfaceImpl.cpp - Bufferize.cpp BufferViewFlowOpInterfaceImpl.cpp EmulateUnsupportedFloats.cpp EmulateWideInt.cpp diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index 7ba347a1f15e47..0fddd60eb8140e 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -320,29 +320,6 @@ struct OneShotBufferizePass }; } // namespace -namespace { -struct BufferizationBufferizePass - : public bufferization::impl::BufferizationBufferizeBase< - BufferizationBufferizePass> { - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry - .insert(); - } -}; -} // namespace - -std::unique_ptr mlir::bufferization::createBufferizationBufferizePass() { - return std::make_unique(); -} - std::unique_ptr mlir::bufferization::createOneShotBufferizePass() { return std::make_unique(); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp deleted file mode 100644 index 8812ca14ba6109..00000000000000 --- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp +++ /dev/null @@ -1,52 +0,0 @@ -//===- Bufferize.cpp - Bufferization of linalg ops ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Linalg/Passes.h" - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Linalg/Transforms/Transforms.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/BuiltinDialect.h" -#include "mlir/IR/Operation.h" -#include "mlir/Pass/Pass.h" - -namespace mlir { -#define GEN_PASS_DEF_LINALGBUFFERIZEPASS -#include "mlir/Dialect/Linalg/Passes.h.inc" -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -/// Converts Linalg operations that work on tensor-type operands or results to -/// work on buffers. 
-struct LinalgBufferizePass - : public impl::LinalgBufferizePassBase { - using impl::LinalgBufferizePassBase< - LinalgBufferizePass>::LinalgBufferizePassBase; - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - linalg::registerBufferizableOpInterfaceExternalModels(registry); - } -}; -} // namespace diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index ed9f40089282a6..7e3dc56e0acdc9 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -2,7 +2,6 @@ add_mlir_dialect_library(MLIRLinalgTransforms AllInterfaces.cpp BubbleUpExtractSlice.cpp BufferizableOpInterfaceImpl.cpp - Bufferize.cpp ConstantFold.cpp ConvertToDestinationStyle.cpp ConvertConv2DToImg2Col.cpp diff --git a/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp deleted file mode 100644 index 9dadbdbc91eca9..00000000000000 --- a/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp +++ /dev/null @@ -1,49 +0,0 @@ -//====----- Bufferize.cpp - Bufferization of shape ops ---------*- C++-*--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Shape/Transforms/Passes.h" - -#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Shape/IR/Shape.h" -#include "mlir/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Pass/Pass.h" - -namespace mlir { -#define GEN_PASS_DEF_SHAPEBUFFERIZE -#include "mlir/Dialect/Shape/Transforms/Passes.h.inc" -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -struct ShapeBufferizePass - : public impl::ShapeBufferizeBase { - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - shape::registerBufferizableOpInterfaceExternalModels(registry); - } -}; -} // namespace - -std::unique_ptr> mlir::createShapeBufferizePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt index 7c9b0d2e5e3a8e..a51c6780c28665 100644 --- a/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_dialect_library(MLIRShapeOpsTransforms BufferizableOpInterfaceImpl.cpp - Bufferize.cpp OutlineShapeComputation.cpp RemoveShapeConstraints.cpp ShapeToShapeLowering.cpp diff --git a/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp deleted file mode 100644 index 
d27c4576a8b7a9..00000000000000 --- a/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp +++ /dev/null @@ -1,58 +0,0 @@ -//===- Bufferize.cpp - Bufferization for `tensor` dialect ops -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements bufferization of `tensor` dialect ops -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/Transforms/Passes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" -#include "mlir/Transforms/DialectConversion.h" - -namespace mlir { -namespace tensor { -#define GEN_PASS_DEF_TENSORBUFFERIZE -#include "mlir/Dialect/Tensor/Transforms/Passes.h.inc" -} // namespace tensor -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -struct TensorBufferizePass - : public tensor::impl::TensorBufferizeBase { - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry - .insert(); - tensor::registerBufferizableOpInterfaceExternalModels(registry); - } -}; -} // namespace - 
-std::unique_ptr mlir::tensor::createTensorBufferizePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt index 0aabdaf667b9d8..ce32dea09bb0b5 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_dialect_library(MLIRTensorTransforms BufferizableOpInterfaceImpl.cpp - Bufferize.cpp ConcatOpPatterns.cpp EmptyOpPatterns.cpp ExtractSliceFromReshapeUtils.cpp diff --git a/mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp deleted file mode 100644 index ee99a99b561090..00000000000000 --- a/mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp +++ /dev/null @@ -1,55 +0,0 @@ -//===- Bufferize.cpp - Bufferization for `vector` dialect ops -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements bufferization of `vector` dialect ops -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" - -#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Vector/Transforms/Passes.h" - -namespace mlir { -namespace vector { -#define GEN_PASS_DEF_VECTORBUFFERIZE -#include "mlir/Dialect/Vector/Transforms/Passes.h.inc" -} // namespace vector -} // namespace mlir - -using namespace mlir; -using namespace bufferization; - -namespace { -struct VectorBufferizePass - : public vector::impl::VectorBufferizeBase { - void runOnOperation() override { - BufferizationOptions options = getPartialBufferizationOptions(); - options.opFilter.allowDialect(); - - if (failed(bufferizeOp(getOperation(), options))) - signalPassFailure(); - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - vector::registerBufferizableOpInterfaceExternalModels(registry); - } -}; -} // namespace - -std::unique_ptr mlir::vector::createVectorBufferizePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt index c4b6abd3e23615..4dbefdd376a8b9 100644 --- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_dialect_library(MLIRVectorTransforms BufferizableOpInterfaceImpl.cpp - Bufferize.cpp 
LowerVectorBroadcast.cpp LowerVectorContract.cpp LowerVectorGather.cpp diff --git a/mlir/test/Dialect/Arith/bufferize.mlir b/mlir/test/Dialect/Arith/bufferize.mlir index 944954e9e4edd8..a3b1454fb68f66 100644 --- a/mlir/test/Dialect/Arith/bufferize.mlir +++ b/mlir/test/Dialect/Arith/bufferize.mlir @@ -1,5 +1,4 @@ -// RUN: mlir-opt %s -arith-bufferize -split-input-file -verify-diagnostics | FileCheck %s -// RUN: mlir-opt %s -arith-bufferize=alignment=64 -split-input-file -verify-diagnostics | FileCheck --check-prefix=ALIGNED %s +// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=arith,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -split-input-file -verify-diagnostics | FileCheck %s // CHECK-LABEL: func @index_cast( // CHECK-SAME: %[[TENSOR:.*]]: tensor, %[[SCALAR:.*]]: i32 @@ -22,10 +21,7 @@ func.func @index_cast(%tensor: tensor, %scalar: i32) -> (tensor, ind // The name isn't load-bearing though. // CHECK: memref.global "private" constant @__constant_3x4xf32 : memref<3x4xf32> = dense<7.000000e+00> -// CHECK-NOT: alignment - -// ALIGNED: memref.global "private" constant @__constant_3x4xf32 : memref<3x4xf32> = dense<7.000000e+00> -// ALIGNED-SAME: {alignment = 64 : i64} +// CHECK-SAME: {alignment = 64 : i64} // CHECK: @basic func.func @basic() -> tensor<3x4xf32> { diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir index 29f27e6838e661..e8ab1184b1fd26 100644 --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -linalg-bufferize -canonicalize -cse -split-input-file %s | FileCheck %s +// RUN: mlir-opt --one-shot-bufferize="dialect-filter=linalg,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -canonicalize -cse -split-input-file %s | FileCheck %s #map0 = affine_map<(d0) -> (d0)> @@ -189,31 +189,3 @@ func.func @bufferize_dot(%in: tensor<4xf32>, %out: tensor) -> tensor { // CHECK: 
%[[OUT_TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]] : memref // CHECK: return %[[OUT_TENSOR]] } - -// ----- - -// This is a regression test. The linalg-bufferize pass should ignore all func -// dialect ops. - -// CHECK-LABEL: func private @csum(tensor<6xi64>) -> tensor<6xi64> -func.func private @csum(%arg0: tensor<6xi64>) -> tensor<6xi64> - -// CHECK: func public @main(%[[arg0:.*]]: tensor<2x3xi1>) -// CHECK: %[[collapse:.*]] = tensor.collapse_shape %[[arg0]] -// CHECK: %[[collapse_m:.*]] = bufferization.to_memref %[[collapse]] -// CHECK: %[[alloc:.*]] = memref.alloc() -// CHECK: linalg.generic {{.*}} ins(%[[collapse_m]] : memref<6xi1>) outs(%[[alloc]] : memref<6xi64>) -// CHECK: %[[generic_t:.*]] = bufferization.to_tensor %[[alloc]] -// CHECK: %[[call:.*]] = call @csum(%[[generic_t]]) -// CHECK: return %[[call]] -func.func public @main(%arg0: tensor<2x3xi1>) -> tensor<6xi64> { - %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<2x3xi1> into tensor<6xi1> - %1 = tensor.empty() : tensor<6xi64> - %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<6xi1>) outs(%1 : tensor<6xi64>) { - ^bb0(%arg1: i1, %arg2: i64): - %4 = arith.extui %arg1 : i1 to i64 - linalg.yield %4 : i64 - } -> tensor<6xi64> - %3 = func.call @csum(%2) : (tensor<6xi64>) -> tensor<6xi64> - return %3 : tensor<6xi64> -} diff --git a/mlir/test/Dialect/Shape/bufferize.mlir b/mlir/test/Dialect/Shape/bufferize.mlir index 963a5e8bcf5787..9f30a052208f0b 100644 --- a/mlir/test/Dialect/Shape/bufferize.mlir +++ b/mlir/test/Dialect/Shape/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -shape-bufferize <%s | FileCheck %s +// RUN: mlir-opt -split-input-file --one-shot-bufferize="dialect-filter=shape,bufferization copy-before-write unknown-type-conversion=identity-layout-map allow-unknown-ops" <%s | FileCheck %s // ----- diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir 
b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir index 6112856fbf2931..c27df00785522a 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir @@ -4,8 +4,7 @@ // RUN: FileCheck %s --check-prefix=CHECK-MIR // // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification --sparse-tensor-conversion --cse \ -// RUN: --func-bufferize --arith-bufferize \ -// RUN: --tensor-bufferize --finalizing-bufferize | \ +// RUN: --one-shot-bufferize="copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" | \ // RUN: FileCheck %s --check-prefix=CHECK-LIR #CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}> diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir index 401da152a8bdb8..9fbb9dd0a26d17 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir @@ -4,8 +4,7 @@ // RUN: FileCheck %s --check-prefix=CHECK-MIR // // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification --sparse-tensor-conversion --cse \ -// RUN: --func-bufferize --arith-bufferize \ -// RUN: --tensor-bufferize --finalizing-bufferize | \ +// RUN: --one-shot-bufferize="copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" | \ // RUN: FileCheck %s --check-prefix=CHECK-LIR #CSC = #sparse_tensor.encoding<{ diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir index d769876d8ee8e3..a827360abb4267 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir @@ -4,8 +4,7 @@ // RUN: FileCheck %s --check-prefix=CHECK-MIR // // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification --sparse-tensor-conversion --cse \ -// RUN: --func-bufferize --arith-bufferize 
\ -// RUN: --tensor-bufferize --finalizing-bufferize | \ +// RUN: --one-shot-bufferize="copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" | \ // RUN: FileCheck %s --check-prefix=CHECK-LIR #CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}> diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir index 4f553adcc500fb..e85d9e740adf4e 100644 --- a/mlir/test/Dialect/Tensor/bufferize.mlir +++ b/mlir/test/Dialect/Tensor/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -tensor-bufferize -cse -split-input-file | FileCheck %s +// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=tensor,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -cse -split-input-file | FileCheck %s // CHECK-LABEL: func @dim( // CHECK-SAME: %[[TENSOR:.*]]: tensor<*xf32>, diff --git a/mlir/test/Dialect/Vector/bufferize-invalid.mlir b/mlir/test/Dialect/Vector/bufferize-invalid.mlir index 1ae3e312c868f7..bcca50a0fe79a6 100644 --- a/mlir/test/Dialect/Vector/bufferize-invalid.mlir +++ b/mlir/test/Dialect/Vector/bufferize-invalid.mlir @@ -1,5 +1,4 @@ -// RUN: mlir-opt %s -vector-bufferize -split-input-file -verify-diagnostics -// | FileCheck %s +// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=vector,bufferization copy-before-write unknown-type-conversion=identity-layout-map allow-unknown-ops" -split-input-file -verify-diagnostics // CHECK-LABEL: func @mask( func.func @mask(%t0: tensor, %val: vector<16xf32>, %idx: index, %m0: vector<16xi1>) -> tensor { diff --git a/mlir/test/Dialect/Vector/bufferize.mlir b/mlir/test/Dialect/Vector/bufferize.mlir index 6a6a8fa8938bc2..3399f60a2c3bf3 100644 --- a/mlir/test/Dialect/Vector/bufferize.mlir +++ b/mlir/test/Dialect/Vector/bufferize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -vector-bufferize -split-input-file | FileCheck %s +// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=vector,bufferization 
copy-before-write unknown-type-conversion=identity-layout-map" -split-input-file | FileCheck %s // CHECK-LABEL: func @transfer_read( // CHECK-SAME: %[[t:.*]]: tensor, %[[o1:.*]]: index, %[[o2:.*]]: index, %[[pad:.*]]: f32) diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir index b0e414d157268b..5d27c3e290d50c 100644 --- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir +++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: -func-bufferize -tensor-bufferize -arith-bufferize --canonicalize \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \ // RUN: -convert-scf-to-cf --convert-complex-to-standard \ // RUN: -finalize-memref-to-llvm -convert-math-to-llvm -convert-math-to-libm \ // RUN: -convert-vector-to-llvm -convert-complex-to-llvm \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir index 43e423d4c3e8e1..734e09b7ed103d 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir @@ -1,10 +1,10 @@ -// RUN: mlir-opt %s -linalg-bufferize \ -// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-scf-to-cf -expand-strided-metadata -lower-affine -convert-cf-to-llvm -convert-arith-to-llvm \ // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_runner_utils \ +// RUN: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \ // RUN: | FileCheck %s diff --git 
a/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir index 84dad567ced3ff..a323b0d9f876cf 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s -convert-elementwise-to-linalg \ -// RUN: -arith-bufferize -linalg-bufferize -tensor-bufferize -func-bufferize \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -canonicalize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops \ // RUN: -convert-scf-to-cf -convert-arith-to-llvm -convert-cf-to-llvm --finalize-memref-to-llvm \ // RUN: -convert-func-to-llvm -reconcile-unrealized-casts | \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir index db882f7a54d392..45283e173c9f02 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir @@ -1,10 +1,10 @@ -// RUN: mlir-opt %s -linalg-bufferize \ -// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-scf-to-cf -expand-strided-metadata -lower-affine -convert-cf-to-llvm -convert-arith-to-llvm \ // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_runner_utils \ +// RUN: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \ // RUN: | FileCheck %s diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir index 54a2bbf8d46809..23a07464bb5be9 100644 
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir @@ -1,6 +1,5 @@ // RUN: mlir-opt %s -test-linalg-transform-patterns=test-linalg-to-vector-patterns \ -// RUN: -empty-tensor-to-alloc-tensor -linalg-bufferize -arith-bufferize \ -// RUN: -bufferization-bufferize -tensor-bufferize -func-bufferize \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \ // RUN: -lower-affine -convert-arith-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir index 98fce6c020c03d..01a0ba26fd7cda 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -linalg-bufferize \ -// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \ // RUN: -lower-affine -convert-arith-to-llvm --finalize-memref-to-llvm \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir index cf7d0c762ea36f..73d4aff73fb7a4 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -linalg-bufferize \ -// 
RUN: -arith-bufferize -tensor-bufferize -func-bufferize \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \ // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \ // RUN: -lower-affine -convert-arith-to-llvm --finalize-memref-to-llvm \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir index 38b49cd444df3c..ff9ddedf91e177 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt %s -arith-bufferize -linalg-bufferize \ -// RUN: -tensor-bufferize -func-bufferize -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops \ +// RUN: mlir-opt %s \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops \ // RUN: -convert-arith-to-llvm -convert-scf-to-cf -convert-cf-to-llvm --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir index 41296cdfcb2d5a..698191577efe31 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir @@ -1,14 +1,14 @@ // UNSUPPORTED: asan -// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -linalg-bufferize -arith-bufferize \ -// RUN: -tensor-bufferize -func-bufferize -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref 
-convert-linalg-to-loops -convert-scf-to-cf \ +// RUN: mlir-opt %s -test-transform-dialect-erase-schedule \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops -convert-scf-to-cf \ // RUN: -expand-strided-metadata -lower-affine -convert-arith-to-llvm -convert-scf-to-cf --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule -linalg-bufferize \ -// RUN: -scf-bufferize -arith-bufferize -tensor-bufferize \ -// RUN: -func-bufferize \ +// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -finalizing-bufferize -convert-linalg-to-loops -convert-scf-to-cf -convert-scf-to-cf \ // RUN: -expand-strided-metadata -lower-affine -convert-arith-to-llvm -convert-scf-to-cf --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Memref/print-memref.mlir b/mlir/test/Integration/Dialect/Memref/print-memref.mlir index b83f3919efd83e..f59e220d7461e6 100644 --- a/mlir/test/Integration/Dialect/Memref/print-memref.mlir +++ b/mlir/test/Integration/Dialect/Memref/print-memref.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: -func-bufferize -arith-bufferize --canonicalize \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \ // RUN: -finalize-memref-to-llvm\ // RUN: -convert-func-to-llvm -reconcile-unrealized-casts |\ // RUN: mlir-cpu-runner \ diff --git a/mlir/test/Integration/Dialect/Memref/verify-memref.mlir 
b/mlir/test/Integration/Dialect/Memref/verify-memref.mlir index b7e2a46688f475..431ae0a89d20c3 100644 --- a/mlir/test/Integration/Dialect/Memref/verify-memref.mlir +++ b/mlir/test/Integration/Dialect/Memref/verify-memref.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: -func-bufferize -arith-bufferize --canonicalize \ +// RUN: -func-bufferize -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \ // RUN: -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm\ // RUN: -convert-func-to-llvm -reconcile-unrealized-casts |\ // RUN: mlir-cpu-runner \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir index faa129efa63a91..a7c5b91273423b 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-cf \ -// RUN: -arith-bufferize -convert-vector-to-llvm="enable-amx" \ +// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -convert-scf-to-cf -convert-vector-to-llvm="enable-amx" \ // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-translate -mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="+amx-tile,+amx-int8,+amx-bf16" \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir index 3ed28fc68acb8f..7b7ee54db8c348 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir @@ -1,5 +1,7 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-cf \ -// RUN: -arith-bufferize -convert-vector-to-llvm="enable-amx" \ +// RUN: 
mlir-opt %s -convert-vector-to-scf -lower-affine \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-vector-to-llvm="enable-amx" \ // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-translate -mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="+amx-tile,+amx-int8,+amx-bf16" \ From a4bef0ca826a8145ef3cb288846017c034a817c2 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 28 May 2024 12:15:50 -0700 Subject: [PATCH 50/89] [libc++] Mark P2845R8 `__cpp_lib_format_path` and P2587R3 `__cpp_lib_to_string` as C++26 (#93255) [P2845R8](https://wg21.link/P2845R8) "Formatting of `std::filesystem::path`" and [P2587R3](https://wg21.link/P2587R3) "`to_string` or not `to_string`" are C++26 features, so they should be marked accordingly in `generate_feature_test_macro_components.py`. I verified that without my changes, running the script produced no edits. Then with my changes, I ran the script to regenerate all files, with no other manual edits. Found while running libc++'s tests with MSVC's STL, which noticed this because it's currently a C++23-only implementation. 
Note that @H-G-Hristov has a draft implementation of P2587R3: #78100 --- libcxx/docs/FeatureTestMacroTable.rst | 8 ++-- libcxx/include/version | 4 +- .../filesystem.version.compile.pass.cpp | 23 +++------- .../string.version.compile.pass.cpp | 23 +++------- .../version.version.compile.pass.cpp | 46 ++++++------------- .../generate_feature_test_macro_components.py | 4 +- 6 files changed, 36 insertions(+), 72 deletions(-) diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 17d2da907692e8..0297068785e8b8 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -326,8 +326,6 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_expected`` ``202211L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_format_path`` *unimplemented* - ---------------------------------------------------------- ----------------- ``__cpp_lib_format_ranges`` ``202207L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_formatters`` *unimplemented* @@ -386,8 +384,6 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_string_resize_and_overwrite`` ``202110L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_to_string`` *unimplemented* - ---------------------------------------------------------- ----------------- ``__cpp_lib_to_underlying`` ``202102L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_tuple_like`` *unimplemented* @@ -412,6 +408,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_default_template_type_for_algorithm_values`` *unimplemented* ---------------------------------------------------------- ----------------- + ``__cpp_lib_format_path`` *unimplemented* + 
---------------------------------------------------------- ----------------- ``__cpp_lib_freestanding_algorithm`` *unimplemented* ---------------------------------------------------------- ----------------- ``__cpp_lib_freestanding_array`` *unimplemented* @@ -466,6 +464,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_to_chars`` *unimplemented* ---------------------------------------------------------- ----------------- + ``__cpp_lib_to_string`` *unimplemented* + ---------------------------------------------------------- ----------------- ``__cpp_lib_tuple_like`` *unimplemented* ========================================================== ================= diff --git a/libcxx/include/version b/libcxx/include/version index 69556d731f1cfc..140a9a0d870360 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -459,7 +459,6 @@ __cpp_lib_void_t 201411L # define __cpp_lib_constexpr_typeinfo 202106L # define __cpp_lib_containers_ranges 202202L # define __cpp_lib_expected 202211L -// # define __cpp_lib_format_path 202403L # define __cpp_lib_format_ranges 202207L // # define __cpp_lib_formatters 202302L # define __cpp_lib_forward_like 202207L @@ -490,7 +489,6 @@ __cpp_lib_void_t 201411L # define __cpp_lib_stdatomic_h 202011L # define __cpp_lib_string_contains 202011L # define __cpp_lib_string_resize_and_overwrite 202110L -// # define __cpp_lib_to_string 202306L # define __cpp_lib_to_underlying 202102L // # define __cpp_lib_tuple_like 202207L # define __cpp_lib_unreachable 202202L @@ -506,6 +504,7 @@ __cpp_lib_void_t 201411L // # define __cpp_lib_copyable_function 202306L // # define __cpp_lib_debugging 202311L // # define __cpp_lib_default_template_type_for_algorithm_values 202403L +// # define __cpp_lib_format_path 202403L // # define __cpp_lib_freestanding_algorithm 202311L // # define __cpp_lib_freestanding_array 202311L // # define __cpp_lib_freestanding_cstring 202306L @@ -537,6 +536,7 @@ __cpp_lib_void_t 
201411L // # define __cpp_lib_text_encoding 202306L # undef __cpp_lib_to_chars // # define __cpp_lib_to_chars 202306L +// # define __cpp_lib_to_string 202306L # undef __cpp_lib_tuple_like // # define __cpp_lib_tuple_like 202311L #endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp index 308cc2d43b0586..4aba33482f69c4 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp @@ -20,7 +20,7 @@ /* Constant Value __cpp_lib_char8_t 201907L [C++20] __cpp_lib_filesystem 201703L [C++17] - __cpp_lib_format_path 202403L [C++23] + __cpp_lib_format_path 202403L [C++26] */ #include @@ -37,7 +37,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 14 @@ -51,7 +51,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 17 @@ -74,7 +74,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 20 @@ -106,7 +106,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER == 23 @@ -137,17 +137,8 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_format_path -# error "__cpp_lib_format_path should be defined in c++23" -# 
endif -# if __cpp_lib_format_path != 202403L -# error "__cpp_lib_format_path should have the value 202403L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_format_path +# error "__cpp_lib_format_path should not be defined before c++26" # endif #elif TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp index 16a9a0a28de635..af6386a40a458a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp @@ -29,7 +29,7 @@ __cpp_lib_string_udls 201304L [C++14] __cpp_lib_string_view 201606L [C++17] 201803L [C++20] - __cpp_lib_to_string 202306L [C++23] + __cpp_lib_to_string 202306L [C++26] */ #include @@ -86,7 +86,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 14 @@ -143,7 +143,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 17 @@ -209,7 +209,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 20 @@ -293,7 +293,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER == 23 @@ -385,17 +385,8 
@@ # error "__cpp_lib_string_view should have the value 201803L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_to_string -# error "__cpp_lib_to_string should be defined in c++23" -# endif -# if __cpp_lib_to_string != 202306L -# error "__cpp_lib_to_string should have the value 202306L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_to_string +# error "__cpp_lib_to_string should not be defined before c++26" # endif #elif TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 7829e06f90760b..c1e1f9f340af48 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -88,7 +88,7 @@ __cpp_lib_expected 202211L [C++23] __cpp_lib_filesystem 201703L [C++17] __cpp_lib_format 202106L [C++20] - __cpp_lib_format_path 202403L [C++23] + __cpp_lib_format_path 202403L [C++26] __cpp_lib_format_ranges 202207L [C++23] __cpp_lib_format_uchar 202311L [C++20] __cpp_lib_formatters 202302L [C++23] @@ -216,7 +216,7 @@ __cpp_lib_to_array 201907L [C++20] __cpp_lib_to_chars 201611L [C++17] 202306L [C++26] - __cpp_lib_to_string 202306L [C++23] + __cpp_lib_to_string 202306L [C++26] __cpp_lib_to_underlying 202102L [C++23] __cpp_lib_transformation_trait_aliases 201304L [C++14] __cpp_lib_transparent_operators 201210L [C++14] @@ -513,7 +513,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -1005,7 +1005,7 @@ # 
endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -1348,7 +1348,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -1891,7 +1891,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -2303,7 +2303,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -2972,7 +2972,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -3543,7 +3543,7 @@ # endif # ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be defined before c++23" +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifdef __cpp_lib_format_ranges @@ -4350,7 +4350,7 @@ # endif # ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined before c++23" +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifdef __cpp_lib_to_underlying @@ -4971,17 +4971,8 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_format_path -# error "__cpp_lib_format_path should be defined in c++23" -# endif -# if __cpp_lib_format_path != 202403L -# error "__cpp_lib_format_path should have the value 202403L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_format_path -# error "__cpp_lib_format_path should not be 
defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_format_path +# error "__cpp_lib_format_path should not be defined before c++26" # endif # ifndef __cpp_lib_format_ranges @@ -5943,17 +5934,8 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_to_string -# error "__cpp_lib_to_string should be defined in c++23" -# endif -# if __cpp_lib_to_string != 202306L -# error "__cpp_lib_to_string should have the value 202306L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_to_string -# error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!" -# endif +# ifdef __cpp_lib_to_string +# error "__cpp_lib_to_string should not be defined before c++26" # endif # ifndef __cpp_lib_to_underlying diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index b04cb4f5115547..1e79f6c140758c 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -515,7 +515,7 @@ def add_version_header(tc): }, { "name": "__cpp_lib_format_path", - "values": {"c++23": 202403}, # P2845R8: Formatting of std::filesystem::path + "values": {"c++26": 202403}, # P2845R8: Formatting of std::filesystem::path "headers": ["filesystem"], "unimplemented": True, }, @@ -1270,7 +1270,7 @@ def add_version_header(tc): }, { "name": "__cpp_lib_to_string", - "values": {"c++23": 202306}, # P2587R3 to_string or not to_string + "values": {"c++26": 202306}, # P2587R3 to_string or not to_string "headers": ["string"], "unimplemented": True, }, From 51752ed0dd737f12014a89dec67d25494083153d Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Tue, 28 May 2024 21:17:31 +0200 Subject: [PATCH 51/89] [mlir][nvgpu] verify the module --- mlir/test/Examples/NVGPU/tools/nvdsl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/Examples/NVGPU/tools/nvdsl.py 
b/mlir/test/Examples/NVGPU/tools/nvdsl.py index 600cae5b47eeec..90dbb2355e1c87 100644 --- a/mlir/test/Examples/NVGPU/tools/nvdsl.py +++ b/mlir/test/Examples/NVGPU/tools/nvdsl.py @@ -431,7 +431,7 @@ def __str__(self): # saveIR(module) # Verify the module - # module.operation.verify() + module.operation.verify() # Compile and JIT MLIR module options = f"cubin-chip=sm_90a cubin-features=+ptx80 opt-level=3" From 266fac8375bdf3f039503c559bb16ffab8895ae5 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 28 May 2024 12:17:57 -0700 Subject: [PATCH 52/89] [libc++] [test] Fix MSVC warnings (#93257) Found while running libc++'s tests with MSVC's STL. * Avoid MSVC warning C5101: use of preprocessor directive in function-like macro argument list is undefined behavior. + We can easily make this portable by extracting `const bool is_newlib`. + Followup to #73440. + See #73598. + See #73836. * Avoid MSVC warning C4267: 'return': conversion from 'size_t' to 'int', possible loss of data. + This warning is valid, but harmless for the test, so `static_cast` will avoid it. * Avoid MSVC warning C4146: unary minus operator applied to unsigned type, result still unsigned. + This warning is also valid (the scenario is sometimes intentional, but surprising enough that it's worth warning about). This is a C++17 test, so we can easily avoid it by testing `is_signed_v` at compile-time before testing `m < 0` and `n < 0` at run-time. * Silence MSVC warning C4310: cast truncates constant value. + These warnings are being emitted by `T(255)`. Disabling the warning is simpler than attempting to restructure the code. + Followup to #79791. * MSVC no longer emits warning C4521: multiple copy constructors specified. + This warning was removed from the compiler, since at least 2021-12-09. 
--- .../atomics.ref/compare_exchange_strong.pass.cpp | 3 +++ .../atomics.ref/compare_exchange_weak.pass.cpp | 3 +++ libcxx/test/std/atomics/atomics.ref/wait.pass.cpp | 3 +++ .../views.span/span.cons/initializer_list.pass.cpp | 4 ++-- .../syserr.errcat.objects/generic_category.pass.cpp | 11 +++++++---- .../syserr.errcat.objects/system_category.pass.cpp | 11 +++++++---- .../numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp | 10 ++++++---- libcxx/test/support/msvc_stdlib_force_include.h | 1 - 8 files changed, 31 insertions(+), 15 deletions(-) diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp index 72b2f444c476c7..90aa5ea5b6df45 100644 --- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp @@ -9,6 +9,9 @@ // XFAIL: !has-64-bit-atomics // XFAIL: !has-1024-bit-atomics +// MSVC warning C4310: cast truncates constant value +// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310 + // bool compare_exchange_strong(T&, T, memory_order, memory_order) const noexcept; // bool compare_exchange_strong(T&, T, memory_order = memory_order::seq_cst) const noexcept; diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp index 5219a8e3714f98..99c1385a2fe0b7 100644 --- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp @@ -9,6 +9,9 @@ // XFAIL: !has-64-bit-atomics // XFAIL: !has-1024-bit-atomics +// MSVC warning C4310: cast truncates constant value +// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310 + // bool compare_exchange_weak(T&, T, memory_order, memory_order) const noexcept; // bool compare_exchange_weak(T&, T, memory_order = memory_order::seq_cst) const noexcept; diff --git 
a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp index e5310febf5c5eb..f246803ba25925 100644 --- a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp @@ -11,6 +11,9 @@ // XFAIL: !has-64-bit-atomics // XFAIL: !has-1024-bit-atomics +// MSVC warning C4310: cast truncates constant value +// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310 + // void wait(T, memory_order = memory_order::seq_cst) const noexcept; #include diff --git a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp index 74a5094f61261d..bc76e23fea3c03 100644 --- a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp +++ b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp @@ -93,9 +93,9 @@ constexpr bool test() { // Test P2447R4 "Annex C examples" -constexpr int three(std::span sp) { return sp.size(); } +constexpr int three(std::span sp) { return static_cast(sp.size()); } -constexpr int four(std::span sp) { return sp.size(); } +constexpr int four(std::span sp) { return static_cast(sp.size()); } bool test_P2447R4_annex_c_examples() { // 1. Overload resolution is affected diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp index d4bbde75ae8821..7283fdc769d86b 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp @@ -50,13 +50,16 @@ int main(int, char**) // responds with an empty message, which we probably want to // treat as a failure code otherwise, but we can detect that // with the preprocessor. 
+#if defined(_NEWLIB_VERSION) + const bool is_newlib = true; +#else + const bool is_newlib = false; +#endif + (void)is_newlib; LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0 // AIX || msg.rfind("No error information", 0) == 0 // Musl || msg.rfind("Unknown error", 0) == 0 // Glibc -#if defined(_NEWLIB_VERSION) - || msg.empty() -#endif - ); + || (is_newlib && msg.empty())); assert(errno == E2BIG); } diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp index eefbddd27a7f53..02a1baf5999831 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp @@ -56,13 +56,16 @@ int main(int, char**) { // responds with an empty message, which we probably want to // treat as a failure code otherwise, but we can detect that // with the preprocessor. 
+#if defined(_NEWLIB_VERSION) + const bool is_newlib = true; +#else + const bool is_newlib = false; +#endif + (void)is_newlib; LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0 // AIX || msg.rfind("No error information", 0) == 0 // Musl || msg.rfind("Unknown error", 0) == 0 // Glibc -#if defined(_NEWLIB_VERSION) - || msg.empty() -#endif - ); + || (is_newlib && msg.empty())); assert(errno == E2BIG); } diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp index 212804356a056d..bf40b174b209cc 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp @@ -57,10 +57,12 @@ T basic_gcd_(T m, T n) { template T basic_gcd(T m, T n) { using Tp = std::make_unsigned_t; - if (m < 0 && m != std::numeric_limits::min()) - m = -m; - if (n < 0 && n != std::numeric_limits::min()) - n = -n; + if constexpr (std::is_signed_v) { + if (m < 0 && m != std::numeric_limits::min()) + m = -m; + if (n < 0 && n != std::numeric_limits::min()) + n = -n; + } return basic_gcd_(static_cast(m), static_cast(n)); } diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h index 6c26085e72c45f..35783c1607b0e0 100644 --- a/libcxx/test/support/msvc_stdlib_force_include.h +++ b/libcxx/test/support/msvc_stdlib_force_include.h @@ -67,7 +67,6 @@ const AssertionDialogAvoider assertion_dialog_avoider{}; // Silence compiler warnings. # pragma warning(disable : 4180) // qualifier applied to function type has no meaning; ignored # pragma warning(disable : 4324) // structure was padded due to alignment specifier -# pragma warning(disable : 4521) // multiple copy constructors specified # pragma warning(disable : 4702) // unreachable code # pragma warning(disable : 28251) // Inconsistent annotation for 'new': this instance has no annotations. 
#endif // !defined(__clang__) From 2ba08386156ef25913b1bee170d8fe95aaceb234 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 28 May 2024 12:20:58 -0700 Subject: [PATCH 53/89] [libc++] [test] Fix portability issues for MSVC (#93259) * Guard `std::__make_from_tuple_impl` tests with `#ifdef _LIBCPP_VERSION` and `LIBCPP_STATIC_ASSERT`. * Change `_LIBCPP_CONSTEXPR_SINCE_CXX20` to `TEST_CONSTEXPR_CXX20`. + Other functions in `variant.swap/swap.pass.cpp` were already using the proper test macro. * Mark `what` as `[[maybe_unused]]` when used by `TEST_LIBCPP_REQUIRE`. + This updates one occurrence in `libcxx/test/libcxx` for consistency. * Windows `_putenv_s()` takes 2 arguments, not 3. + See MSVC documentation: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/putenv-s-wputenv-s?view=msvc-170 + POSIX `setenv()` takes `int overwrite`, but Windows `_putenv_s()` always overwrites. * Avoid non-Standard zero-length arrays. + Followup to #74183 and #79792. * Add `operator++()` to `unsized_it`. + The Standard requires this due to [N4981][] [move.iter.requirements]/1 "The template parameter `Iterator` shall either meet the *Cpp17InputIterator* requirements ([input.iterators]) or model `input_iterator` ([iterator.concept.input])." + MSVC's STL requires this because it has a strengthened exception specification in `move_iterator` that inspects the underlying iterator's increment operator. * `uniform_int_distribution` forbids `int8_t`/`uint8_t`. + See [N4981][] [rand.req.genl]/1.5. MSVC's STL enforces this. + Note that when changing the distribution's `IntType`, we need to be careful to preserve the original value range of `[0, max_input]`. * fstreams are constructible from `const fs::path::value_type*` on wide systems. + See [ifstream.cons], [ofstream.cons], [fstream.cons]. * In `msvc_stdlib_force_include.h`, map `_HAS_CXX23` to `TEST_STD_VER` 23 instead of 99. 
+ On 2023-05-23, https://github.com/llvm/llvm-project/commit/71400505ca048507e827013eb1ea0bc863525cab started recognizing 23 as a distinct value. * Fix test name typo: `destory_elements.pass.cpp` => `destroy_elements.pass.cpp` [N4981]: https://wg21.link/N4981 --- .../time.zone.db.tzdb/locate_zone.pass.cpp | 2 +- .../ranges.contains_subrange.pass.cpp | 25 +++++++++-------- ...nts.pass.cpp => destroy_elements.pass.cpp} | 0 .../fstreams/fstream.cons/path.pass.cpp | 2 +- .../fstreams/ifstream.cons/path.pass.cpp | 2 +- .../fstreams/ofstream.cons/path.pass.cpp | 2 +- .../sized_sentinel.compile.pass.cpp | 1 + .../numeric.ops/numeric.ops.gcd/gcd.pass.cpp | 9 ++++-- .../time.zone.db.access/current_zone.pass.cpp | 2 +- .../time.zone.db.access/locate_zone.pass.cpp | 2 +- .../time.zone.db.tzdb/current_zone.pass.cpp | 2 +- .../time.zone.db.tzdb/locate_zone.pass.cpp | 2 +- .../tuple.apply/make_from_tuple.pass.cpp | 28 ++++++++++--------- .../variant.swap/swap.pass.cpp | 2 +- .../test/support/msvc_stdlib_force_include.h | 2 +- 15 files changed, 45 insertions(+), 38 deletions(-) rename libcxx/test/std/containers/sequences/vector/vector.modifiers/{destory_elements.pass.cpp => destroy_elements.pass.cpp} (100%) diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp index 3ee213358f3524..08c682964c3745 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -73,7 +73,7 @@ L link link_to_link TEST_VALIDATE_EXCEPTION( std::runtime_error, [&]([[maybe_unused]] const std::runtime_error& e) { - std::string_view what{"tzdb: requested time zone not found"}; + [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"}; TEST_LIBCPP_REQUIRE( e.what() == what, TEST_WRITE_CONCATENATED("\nExpected exception ", 
what, "\nActual exception ", e.what(), '\n')); diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp index 761691c2afdcb9..890ac23fff8327 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp @@ -24,6 +24,7 @@ // Proj1 proj1 = {}, Proj2 proj2 = {}); // since C++23 #include +#include #include #include #include @@ -130,10 +131,10 @@ constexpr void test_iterators() { } { // range has zero length - int a[] = {}; - int p[] = {3, 4, 2}; - auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(a))); - auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p)))); + std::array a = {}; + int p[] = {3, 4, 2}; + auto whole = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data()))); + auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p)))); { bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end()); assert(!ret); @@ -145,10 +146,10 @@ constexpr void test_iterators() { } { // subrange has zero length - int a[] = {3, 4, 2}; - int p[] = {}; - auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a)))); - auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(p))); + int a[] = {3, 4, 2}; + std::array p = {}; + auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a)))); + auto subrange = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data()))); { bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end()); assert(ret); @@ -160,10 +161,10 @@ constexpr void test_iterators() { } { // range and subrange both have zero length - int a[] = {}; - int p[] = {}; - auto whole = std::ranges::subrange(Iter1(a), Sent1(Iter1(a))); - auto subrange = 
std::ranges::subrange(Iter2(p), Sent2(Iter2(p))); + std::array a = {}; + std::array p = {}; + auto whole = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data()))); + auto subrange = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data()))); { bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end()); assert(ret); diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp similarity index 100% rename from libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp rename to libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp index 5edf22eaacf31f..d6bb56d9b78b79 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp @@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() { static_assert(!std::is_constructible_v>); // Char* pointers - if constexpr (!std::is_same_v) + if constexpr (!std::is_same_v && !std::is_same_v) static_assert(!std::is_constructible_v); // Iterators diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp index 2f27fd8e6e93d3..792b65615679a7 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp @@ -38,7 +38,7 @@ constexpr bool test_non_convert_to_path() { static_assert(!std::is_constructible_v>); // Char* pointers - if constexpr (!std::is_same_v) + if constexpr (!std::is_same_v 
&& !std::is_same_v) static_assert(!std::is_constructible_v); // Iterators diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp index e55adfd83fc3c7..602bdadd85813f 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp @@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() { static_assert(!std::is_constructible_v>); // Char* pointers - if constexpr (!std::is_same_v) + if constexpr (!std::is_same_v && !std::is_same_v) static_assert(!std::is_constructible_v); // Iterators diff --git a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp index cb49086dd6802b..998b13ed494552 100644 --- a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp @@ -21,6 +21,7 @@ struct unsized_it { using difference_type = std::ptrdiff_t; value_type& operator*() const; + unsized_it& operator++(); bool operator==(const unsized_it&) const; difference_type operator-(const unsized_it&) const { return 0; } }; diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp index bf40b174b209cc..6a9ec1a2ffec24 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -69,12 +70,14 @@ T basic_gcd(T m, T n) { template void do_fuzzy_tests() { std::mt19937 gen(1938); - std::uniform_int_distribution distrib; + using DistIntType = std::conditional_t; // See 
N4981 [rand.req.genl]/1.5 + constexpr Input max_input = std::numeric_limits::max(); + std::uniform_int_distribution distrib(0, max_input); constexpr int nb_rounds = 10000; for (int i = 0; i < nb_rounds; ++i) { - Input n = distrib(gen); - Input m = distrib(gen); + Input n = static_cast(distrib(gen)); + Input m = static_cast(distrib(gen)); assert(std::gcd(n, m) == basic_gcd(n, m)); } } diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp index 2c43e121613c77..f31a679dd6214f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp @@ -32,7 +32,7 @@ static void set_tz(std::string zone) { // Unlike POSIX it does not mention the string of putenv becomes part // of the environment. - int status = _putenv_s("TZ", zone.c_str(), 1); + int status = _putenv_s("TZ", zone.c_str()); assert(status == 0); } diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp index 4d600fcdf40e3f..8dd895fd21814f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp @@ -40,7 +40,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) { TEST_VALIDATE_EXCEPTION( std::runtime_error, [&]([[maybe_unused]] const std::runtime_error& e) { - std::string_view what{"tzdb: requested time zone not found"}; + [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"}; TEST_LIBCPP_REQUIRE( e.what() == what, TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); diff --git 
a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp index e6497e26323ce6..98509c298ebcb8 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp @@ -34,7 +34,7 @@ static void set_tz(std::string zone) { // Unlike POSIX it does not mention the string of putenv becomes part // of the environment. - int status = _putenv_s("TZ", zone.c_str(), 1); + int status = _putenv_s("TZ", zone.c_str()); assert(status == 0); } diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp index f929dafcc96838..08ce48dfd0edb2 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -42,7 +42,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) { TEST_VALIDATE_EXCEPTION( std::runtime_error, [&]([[maybe_unused]] const std::runtime_error& e) { - std::string_view what{"tzdb: requested time zone not found"}; + [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"}; TEST_LIBCPP_REQUIRE( e.what() == what, TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp index d7374351afa8bf..accb601dd00365 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp @@ -209,6 +209,7 @@ template static constexpr bool can_make_from_tuple = 
std::is_same_v(T{}, Tuple{})), uint8_t>; +#ifdef _LIBCPP_VERSION template auto test_make_from_tuple_impl(T&&, Tuple&& t) -> decltype(std::__make_from_tuple_impl( @@ -224,6 +225,7 @@ uint32_t test_make_from_tuple_impl(...) { template static constexpr bool can_make_from_tuple_impl = std::is_same_v(T{}, Tuple{})), uint8_t>; +#endif // _LIBCPP_VERSION struct A { int a; @@ -263,23 +265,23 @@ static_assert(can_make_from_tuple>); // Test std::__make_from_tuple_impl constraints. // reinterpret_cast -static_assert(!can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); // const_cast -static_assert(!can_make_from_tuple_impl>); -static_assert(!can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); // static_cast -static_assert(!can_make_from_tuple_impl>); -static_assert(!can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); -static_assert(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); +LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl>); } // namespace LWG3528 diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp index db05691c55818c..039a2373348c4e 100644 --- 
a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp +++ b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp @@ -516,7 +516,7 @@ constexpr void test_swap_sfinae() { } } -_LIBCPP_CONSTEXPR_SINCE_CXX20 void test_swap_noexcept() { +TEST_CONSTEXPR_CXX20 void test_swap_noexcept() { { using V = std::variant; static_assert(std::is_swappable_v && has_swap_member(), ""); diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h index 35783c1607b0e0..785670224c3b18 100644 --- a/libcxx/test/support/msvc_stdlib_force_include.h +++ b/libcxx/test/support/msvc_stdlib_force_include.h @@ -90,7 +90,7 @@ const AssertionDialogAvoider assertion_dialog_avoider{}; #include #if _HAS_CXX23 -# define TEST_STD_VER 99 +# define TEST_STD_VER 23 #elif _HAS_CXX20 # define TEST_STD_VER 20 #elif _HAS_CXX17 From bc247ba113543b07fcff769ab616cf9509eb2794 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 12:42:31 -0700 Subject: [PATCH 54/89] [memprof] Rename memprof-merge-v0.test to memprof-merge-versions.test (#93602) Despite the name, the test is used to test merge/show roundtrips for different MemProf versions. This patch renames the test to match the reality. --- .../{memprof-merge-v0.test => memprof-merge-versions.test} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llvm/test/tools/llvm-profdata/{memprof-merge-v0.test => memprof-merge-versions.test} (100%) diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-v0.test b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test similarity index 100% rename from llvm/test/tools/llvm-profdata/memprof-merge-v0.test rename to llvm/test/tools/llvm-profdata/memprof-merge-versions.test From 1c3a3f0e79a9c6a7c1c4a71c43a9eab783c3b266 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 12:49:42 -0700 Subject: [PATCH 55/89] [LegalizeTypes] Use VP_AND and VP_SHL/VP_SRA to promote operands fo VP arithmetic. 
(#92799) This adds VPSExtPromotedInteger and VPZExtPromotedInteger and uses them to promote many arithmetic operations. VPSExtPromotedInteger uses a shift pair because we don't have VP_SIGN_EXTEND_INREG yet. --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 113 ++++++++++++------ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 21 ++++ llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 12 +- llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll | 6 +- .../RISCV/rvv/fixed-vectors-vdiv-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vdivu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vmax-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vmin-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vminu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vrem-vp.ll | 9 +- .../RISCV/rvv/fixed-vectors-vremu-vp.ll | 5 +- .../RISCV/rvv/fixed-vectors-vshl-vp.ll | 3 +- .../RISCV/rvv/fixed-vectors-vsra-vp.ll | 7 +- .../RISCV/rvv/fixed-vectors-vsrl-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll | 40 +++---- llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll | 5 +- llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll | 7 +- llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll | 5 +- 27 files changed, 201 insertions(+), 136 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 8fda35f0086329..12f1d005249d60 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -646,18 +646,21 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { } } - // Zero extend to the promoted type and do the count 
there. - SDValue Op = ZExtPromotedInteger(N->getOperand(0)); - // Subtract off the extra leading bits in the bigger type. SDValue ExtractLeadingBits = DAG.getConstant( NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT); - if (!N->isVPOpcode()) + if (!N->isVPOpcode()) { + // Zero extend to the promoted type and do the count there. + SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::SUB, dl, NVT, DAG.getNode(N->getOpcode(), dl, NVT, Op), ExtractLeadingBits); + } + SDValue Mask = N->getOperand(1); SDValue EVL = N->getOperand(2); + // Zero extend to the promoted type and do the count there. + SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); return DAG.getNode(ISD::VP_SUB, dl, NVT, DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL), ExtractLeadingBits, Mask, EVL); @@ -681,11 +684,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) { } // Zero extend to the promoted type and do the count or parity there. - SDValue Op = ZExtPromotedInteger(N->getOperand(0)); - if (!N->isVPOpcode()) + if (!N->isVPOpcode()) { + SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op); - return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, - N->getOperand(1), N->getOperand(2)); + } + + SDValue Mask = N->getOperand(1); + SDValue EVL = N->getOperand(2); + SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, Mask, + EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { @@ -1335,12 +1343,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FFREXP(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); - if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) - RHS = ZExtPromotedInteger(RHS); - if (N->getOpcode() != ISD::VP_SHL) + if 
(N->getOpcode() != ISD::VP_SHL) { + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = ZExtPromotedInteger(RHS); + return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } + + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = VPZExtPromotedInteger(RHS, Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) { @@ -1364,27 +1379,39 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) { - // Sign extend the input. - SDValue LHS = SExtPromotedInteger(N->getOperand(0)); - SDValue RHS = SExtPromotedInteger(N->getOperand(1)); - if (N->getNumOperands() == 2) + if (N->getNumOperands() == 2) { + // Sign extend the input. + SDValue LHS = SExtPromotedInteger(N->getOperand(0)); + SDValue RHS = SExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); assert(N->isVPOpcode() && "Expected VP opcode"); + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + // Sign extend the input. + SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL); + SDValue RHS = VPSExtPromotedInteger(N->getOperand(1), Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) { - // Zero extend the input. - SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); - SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); - if (N->getNumOperands() == 2) + if (N->getNumOperands() == 2) { + // Zero extend the input. 
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); + SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); assert(N->isVPOpcode() && "Expected VP opcode"); + // Zero extend the input. + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); + SDValue RHS = VPZExtPromotedInteger(N->getOperand(1), Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) { @@ -1400,27 +1427,43 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) { - // The input value must be properly sign extended. - SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); - if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) - RHS = ZExtPromotedInteger(RHS); - if (N->getOpcode() != ISD::VP_SRA) + if (N->getOpcode() != ISD::VP_SRA) { + // The input value must be properly sign extended. + SDValue LHS = SExtPromotedInteger(N->getOperand(0)); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = ZExtPromotedInteger(RHS); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } + + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + // The input value must be properly sign extended. 
+ SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = VPZExtPromotedInteger(RHS, Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) { - // The input value must be properly zero extended. - SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); - if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) - RHS = ZExtPromotedInteger(RHS); - if (N->getOpcode() != ISD::VP_SRL) + if (N->getOpcode() != ISD::VP_SRL) { + // The input value must be properly zero extended. + SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = ZExtPromotedInteger(RHS); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + } + + SDValue Mask = N->getOperand(2); + SDValue EVL = N->getOperand(3); + // The input value must be properly zero extended. 
+ SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL); + if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) + RHS = VPZExtPromotedInteger(RHS, Mask, EVL); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, - N->getOperand(2), N->getOperand(3)); + Mask, EVL); } SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) { @@ -1487,7 +1530,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VPFunnelShift(SDNode *N) { SDValue Mask = N->getOperand(3); SDValue EVL = N->getOperand(4); if (getTypeAction(Amt.getValueType()) == TargetLowering::TypePromoteInteger) - Amt = ZExtPromotedInteger(Amt); + Amt = VPZExtPromotedInteger(Amt, Mask, EVL); EVT AmtVT = Amt.getValueType(); SDLoc DL(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d925089d5689f1..ba3c7582d5a8a2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -275,6 +275,27 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { return DAG.getZeroExtendInReg(Op, dl, OldVT); } + /// Get a promoted operand and zero extend it to the final size. + SDValue VPSExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) { + EVT OldVT = Op.getValueType(); + SDLoc dl(Op); + Op = GetPromotedInteger(Op); + // FIXME: Add VP_SIGN_EXTEND_INREG. + EVT VT = Op.getValueType(); + unsigned BitsDiff = VT.getScalarSizeInBits() - OldVT.getScalarSizeInBits(); + SDValue ShiftCst = DAG.getShiftAmountConstant(BitsDiff, VT, dl); + SDValue Shl = DAG.getNode(ISD::VP_SHL, dl, VT, Op, ShiftCst, Mask, EVL); + return DAG.getNode(ISD::VP_SRA, dl, VT, Shl, ShiftCst, Mask, EVL); + } + + /// Get a promoted operand and zero extend it to the final size. 
+ SDValue VPZExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) { + EVT OldVT = Op.getValueType(); + SDLoc dl(Op); + Op = GetPromotedInteger(Op); + return DAG.getVPZeroExtendInReg(Op, Mask, EVL, dl, OldVT); + } + // Promote the given operand V (vector or scalar) according to N's specific // reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. Returns // the nominal extension opcode (ISD::(ANY|ZERO|SIGN)_EXTEND) and the diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll index fff280c005b542..df413b878172bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll @@ -2574,9 +2574,8 @@ define @vp_ctlz_nxv1i9( %va, @vp_ctlz_nxv1i9( %va, @vp_ctlz_zero_undef_nxv1i9( %va, @vp_ctlz_zero_undef_nxv1i9( %va, @vp_ctpop_nxv1i9( %va, @vp_ctpop_nxv1i9( %va, @llvm.vp.ctpop.nxv1i9( %va, %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll index 29f8eaba900527..e3c7d02462cc7f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vdiv_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vdiv_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vdiv.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll index 3f8eb0ff276b7f..03bd85bf5e69e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vdivu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vdivu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vdivu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.udiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll index 9789afda9344ad..0b0d758ad8ded8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smax.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vmax_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmax_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.smax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll index 36b0a4642b6169..98e630a0e59e5a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vmaxu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vmaxu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vmaxu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.umax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll index adb0a30f34d35a..a6e3764b37550d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smin.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vmin_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmin_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmin.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.smin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll index 671ce82d4ae795..c59b65edd1ec10 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vminu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vminu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vminu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.umin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll index 4bbbad5ed0e0e8..ff8a63e371c8ef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll @@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.srem.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) define <8 x i7> @vrem_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vrem_vv_v8i7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vrem.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.srem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll index ee11307bddc88c..b5eec4142c7824 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vremu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK-LABEL: vremu_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vremu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.urem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll index c4b7c1f2f19f0f..16a0fddfa98277 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll @@ -10,9 +10,8 @@ define <8 x i7> @vsll_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex ; CHECK-LABEL: vsll_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.shl.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll index 7ea5b1f0b505a3..180fafa9659b1c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll @@ -10,11 +10,10 @@ define <8 x i7> @vsra_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex ; CHECK-LABEL: vsra_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; 
CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsra.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.ashr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll index 9f9d4af0cc2f3f..22f04803eadd74 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll @@ -10,10 +10,9 @@ define <8 x i7> @vsrl_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex ; CHECK-LABEL: vsrl_vv_v8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 127 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i7> @llvm.vp.lshr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index bc5617957d7d08..2c5a3dfffc2cfc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -1282,18 +1282,17 @@ define @fshr_v1i9( %a, %b, ; CHECK-LABEL: fshr_v1i9: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 511 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t +; CHECK-NEXT: vand.vx v10, v10, a1, v0.t ; CHECK-NEXT: li a0, 9 ; CHECK-NEXT: vremu.vx v10, v10, a0, v0.t ; CHECK-NEXT: vadd.vi v10, v10, 7, v0.t ; CHECK-NEXT: vand.vi v11, v10, 15, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t ; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, 
v0.t ; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vi v10, v10, 15, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -1306,18 +1305,17 @@ define @fshl_v1i9( %a, %b, ; CHECK-LABEL: fshl_v1i9: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 511 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t -; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t +; CHECK-NEXT: vand.vx v10, v10, a1, v0.t ; CHECK-NEXT: li a0, 9 ; CHECK-NEXT: vremu.vx v10, v10, a0, v0.t -; CHECK-NEXT: vnot.v v11, v10, v0.t -; CHECK-NEXT: vand.vi v11, v11, 15, v0.t -; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t +; CHECK-NEXT: vand.vi v11, v10, 15, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v11, v0.t +; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vi v10, v10, 15, v0.t -; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 7, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vv v9, v9, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %res = call @llvm.vp.fshl.nxv1i9( %a, %b, %c, %m, i32 %evl) @@ -1330,15 +1328,14 @@ declare @llvm.vp.fshr.nxv1i4(, @fshr_v1i4( %a, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: fshr_v1i4: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vand.vi v10, v10, 15 -; CHECK-NEXT: li a1, 4 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vremu.vx v10, v10, a1, v0.t +; CHECK-NEXT: vand.vi v10, v10, 15, v0.t ; CHECK-NEXT: vand.vi v9, v9, 15, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vv v8, v8, v10, v0.t +; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vremu.vx v9, v10, a0, v0.t +; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: ret %trunca = call @llvm.vp.trunc.nxv1i4.nxv1i8( %a, %m, i32 zeroext %evl) @@ 
-1353,15 +1350,14 @@ declare @llvm.vp.fshl.nxv1i4(, @fshl_v1i4( %a, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: fshl_v1i4: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vand.vi v10, v10, 15 -; CHECK-NEXT: li a1, 4 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vremu.vx v10, v10, a1, v0.t +; CHECK-NEXT: vand.vi v10, v10, 15, v0.t ; CHECK-NEXT: vand.vi v9, v9, 15, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t +; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vremu.vx v9, v10, a0, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll index 26089706cf99ef..a4b7ca7f39768f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.sdiv.nxv8i7(, @vdiv_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vdiv_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vdiv.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll index f41b885a66eaae..67c3f9dbf2869a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll @@ -10,11 +10,12 @@ define 
@vdivu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vdivu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vdivu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll index 8a76467986620c..c15caa31bb0986 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.smax.nxv8i7(, @vmax_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vmax_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll index 1c74887c1b20fb..df494f8af7387c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll @@ -10,11 +10,12 @@ define @vmaxu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vmaxu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: 
vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vmaxu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll index 1c71242c3c7d79..794a21c7c6abac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.smin.nxv8i7(, @vmin_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vmin_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vmin.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll index 6d89a9777cf917..d54de281a7fd28 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll @@ -10,11 +10,12 @@ define @vminu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vminu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; 
CHECK-NEXT: vminu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll index cf85fd827b51f1..2ef96f4b3896fc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll @@ -9,11 +9,15 @@ declare @llvm.vp.srem.nxv8i7(, @vrem_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vrem_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vrem.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll index 61bdd5b8d3c8a7..1f1ed4a1269acb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll @@ -10,11 +10,12 @@ define @vremu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vremu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vremu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll index 
c04d5ea2da3c1b..380835494ed17d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll @@ -12,8 +12,8 @@ define @vsll_vx_nxv8i7( %a, i7 signext %b, poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll index 632c4db5c5bb57..cff8cc710d21f3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll @@ -9,13 +9,14 @@ declare @llvm.vp.ashr.nxv8i7(, @vsra_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { ; CHECK-LABEL: vsra_vx_nxv8i7: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vsra.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll index ec5b7f3faf7ca8..ff6771b643031f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll @@ -10,11 +10,12 @@ define @vsrl_vx_nxv8i7( %a, i7 signext %b, poison, i7 %b, i32 0 From 0e96eebc7f681a7ce41f35909e609c7c61a11455 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Tue, 28 May 2024 12:52:45 -0700 Subject: [PATCH 56/89] [msan] Reland: Increase k num stack origin descrs (limited to non-PowerPC) (#93117) The original pull request (https://github.com/llvm/llvm-project/pull/92838) was reverted due to a PowerPC buildbot breakage (https://github.com/llvm/llvm-project/commit/df626dd11c360c58eddae813ce6a0524d0a53696). This reland limits the scope of the change to non-PowerPC platforms. 
I am unaware of any PowerPC use cases that would benefit from a larger kNumStackOriginDescrs constant. Original CL description: This increases the constant size of kNumStackOriginDescrs to 4M (64GB of BSS across two arrays), which ought to be enough for anybody. This is the easier alternative suggested by eugenis@ in https://github.com/llvm/llvm-project/pull/92826. --- compiler-rt/lib/msan/msan.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index a2fc27de1901b4..9375e27d4f4d24 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -100,7 +100,17 @@ int msan_report_count = 0; // Array of stack origins. // FIXME: make it resizable. -static const uptr kNumStackOriginDescrs = 1024 * 1024; +// Although BSS memory doesn't cost anything until used, it is limited to 2GB +// in some configurations (e.g., "relocation R_X86_64_PC32 out of range: +// ... is not in [-2147483648, 2147483647]; references section '.bss'"). +// We use kNumStackOriginDescrs * (sizeof(char*) + sizeof(uptr)) == 64MB. +#ifdef SANITIZER_PPC +// soft_rss_limit test (release_origin.c) fails on PPC if kNumStackOriginDescrs +// is too high +static const uptr kNumStackOriginDescrs = 1 * 1024 * 1024; +#else +static const uptr kNumStackOriginDescrs = 4 * 1024 * 1024; +#endif // SANITIZER_PPC static const char *StackOriginDescr[kNumStackOriginDescrs]; static uptr StackOriginPC[kNumStackOriginDescrs]; static atomic_uint32_t NumStackOriginDescrs; From d9dec109375ded13d61da20877c399fb8fbb877d Mon Sep 17 00:00:00 2001 From: Lucile Rose Nihlen Date: Tue, 28 May 2024 19:53:21 +0000 Subject: [PATCH 57/89] [ci] limit parallel windows compile jobs to 24 (#93329) This is an experiment to see if we can prevent some of the compiler OOMs happening without unduly impacting the Windows build latency. 
--- .ci/monolithic-windows.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 4fd88ea81c84a8..91e719c52d4363 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -44,6 +44,8 @@ pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt # see https://github.com/llvm/llvm-project/pull/82393 and # https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40 # for further information. +# We limit the number of parallel compile jobs to 24 to control memory +# consumption and improve build reliability. cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_PROJECTS="${projects}" \ -G Ninja \ @@ -58,7 +60,9 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D MLIR_ENABLE_BINDINGS_PYTHON=ON \ -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \ -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \ - -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" + -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \ + -D LLVM_PARALLEL_COMPILE_JOBS=16 \ + -D LLVM_PARALLEL_LINK_JOBS=4 echo "--- ninja" # Targets are not escaped as they are passed as separate arguments. 
From c96860aea2c77392bad16f1c4f55014164669de3 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Tue, 28 May 2024 22:09:34 +0200 Subject: [PATCH 58/89] [clang-tidy] Optimize realpath in readability-identifier-naming (#92659) - Reduce disk IO usage by adding cache to an realpath introduced by #81985 --- .../clang-tidy/readability/IdentifierNamingCheck.cpp | 12 ++++++++++-- .../clang-tidy/readability/IdentifierNamingCheck.h | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index c3208392df1566..828f13805a6980 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -1414,13 +1414,21 @@ IdentifierNamingCheck::getDiagInfo(const NamingCheckId &ID, }}; } +StringRef IdentifierNamingCheck::getRealFileName(StringRef FileName) const { + auto Iter = RealFileNameCache.try_emplace(FileName); + SmallString<256U> &RealFileName = Iter.first->getValue(); + if (!Iter.second) + return RealFileName; + llvm::sys::fs::real_path(FileName, RealFileName); + return RealFileName; +} + const IdentifierNamingCheck::FileStyle & IdentifierNamingCheck::getStyleForFile(StringRef FileName) const { if (!GetConfigPerFile) return *MainFileStyle; - SmallString<128> RealFileName; - llvm::sys::fs::real_path(FileName, RealFileName); + StringRef RealFileName = getRealFileName(FileName); StringRef Parent = llvm::sys::path::parent_path(RealFileName); auto Iter = NamingStylesCache.find(Parent); if (Iter != NamingStylesCache.end()) diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h index 27c8e4bc768c40..646ec0eac8dd1c 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h @@ 
-205,6 +205,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { const NamingCheckFailure &Failure) const override; const FileStyle &getStyleForFile(StringRef FileName) const; + StringRef getRealFileName(StringRef FileName) const; /// Find the style kind of a field in an anonymous record. StyleKind findStyleKindForAnonField( @@ -222,6 +223,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { /// Stores the style options as a vector, indexed by the specified \ref /// StyleKind, for a given directory. mutable llvm::StringMap NamingStylesCache; + mutable llvm::StringMap> RealFileNameCache; FileStyle *MainFileStyle; ClangTidyContext *Context; const bool GetConfigPerFile; From 0aacef3abc41cfc8efb5f1b9483bc37599352a59 Mon Sep 17 00:00:00 2001 From: Mattan Elkaim <73639004+mattanelkaim@users.noreply.github.com> Date: Tue, 28 May 2024 23:19:01 +0300 Subject: [PATCH 59/89] [clang-tidy][NFC] Update identifier-length.rst (#93467) Swapped code blocks of parameter and variable, which have been confused (in a clang-tidy doc file) --- .../checks/readability/identifier-length.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst index 44d97f7b363bff..271970c292c8fa 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst @@ -28,10 +28,7 @@ The following options are described below: .. code-block:: c++ - int doubler(int x) // warns that x is too short - { - return 2 * x; - } + int i = 42; // warns that 'i' is too short This check does not have any fix suggestions in the general case since variable names have semantic value. @@ -50,7 +47,10 @@ The following options are described below: .. 
code-block:: c++ - int i = 42; // warns that 'i' is too short + int doubler(int x) // warns that x is too short + { + return 2 * x; + } This check does not have any fix suggestions in the general case since variable names have semantic value. From c108c1e94580d70e2be66172ab4397fcff004376 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 13:26:36 -0700 Subject: [PATCH 60/89] [WebAssembly] Rename old EH tests to *-legacy (#93585) I think test files for the legacy and the new EH (exnref) are better be separate, and I'd like to use the current test file names for the new EH, rather than keeping the current files and naming the new ones as `-new` or something. --- .../WebAssembly/{cfg-stackify-eh.ll => cfg-stackify-eh-legacy.ll} | 0 .../CodeGen/WebAssembly/{exception.ll => exception-legacy.ll} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/WebAssembly/{cfg-stackify-eh.ll => cfg-stackify-eh-legacy.ll} (100%) rename llvm/test/CodeGen/WebAssembly/{exception.ll => exception-legacy.ll} (100%) diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll similarity index 100% rename from llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll rename to llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll similarity index 100% rename from llvm/test/CodeGen/WebAssembly/exception.ll rename to llvm/test/CodeGen/WebAssembly/exception-legacy.ll From 9e89d107a6ec2ade15eddb549fa473cf09bf230e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 May 2024 13:30:00 -0700 Subject: [PATCH 61/89] [memprof] Add MemProf format Version 3 (#93608) This patch adds Version 3 for development purposes. For now, this patch adds V3 as a copy of V2. For the most part, this patch adds "case Version3:" wherever "case Version2:" appears. 
One exception is writeMemProfV3, which is copied from writeMemProfV2 but updated to write out memprof::Version3 to the MemProf header. We'll incrementally modify writeMemProfV3 in subsequent patches. --- llvm/include/llvm/ProfileData/MemProf.h | 4 +- llvm/lib/ProfileData/InstrProfReader.cpp | 4 +- llvm/lib/ProfileData/InstrProfWriter.cpp | 52 +++++++++++++++++++ llvm/lib/ProfileData/MemProf.cpp | 4 ++ .../llvm-profdata/memprof-merge-versions.test | 6 +++ llvm/tools/llvm-profdata/llvm-profdata.cpp | 3 +- 6 files changed, 70 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 17cef15344285b..d44a2d1e2fb117 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -28,10 +28,12 @@ enum IndexedVersion : uint64_t { Version1 = 1, // Version 2: Added a call stack table. Version2 = 2, + // Version 3: Under development. + Version3 = 3, }; constexpr uint64_t MinimumSupportedVersion = Version0; -constexpr uint64_t MaximumSupportedVersion = Version2; +constexpr uint64_t MaximumSupportedVersion = Version3; // Verify that the minimum and maximum satisfy the obvious constraint. static_assert(MinimumSupportedVersion <= MaximumSupportedVersion); diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 836206a4fd86e2..798236c295194a 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1212,7 +1212,8 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start, const uint64_t FirstWord = support::endian::readNext(Ptr); - if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2) { + if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2 || + FirstWord == memprof::Version3) { // Everything is good. We can proceed to deserialize the rest. 
Version = static_cast(FirstWord); } else if (FirstWord >= 24) { @@ -1559,6 +1560,7 @@ IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const { "MemProfCallStackTable must not be available"); return getMemProfRecordV0(IndexedRecord, *MemProfFrameTable); case memprof::Version2: + case memprof::Version3: assert(MemProfFrameTable && "MemProfFrameTable must be available"); assert(MemProfCallStackTable && "MemProfCallStackTable must be available"); return getMemProfRecordV2(IndexedRecord, *MemProfFrameTable, diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index b67a9700b680ab..b16714ae8b9a2d 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -617,6 +617,56 @@ static Error writeMemProfV2(ProfOStream &OS, return Error::success(); } +// Write out MemProf Version3 as follows: +// uint64_t Version +// uint64_t RecordTableOffset = RecordTableGenerator.Emit +// uint64_t FramePayloadOffset = Offset for the frame payload +// uint64_t FrameTableOffset = FrameTableGenerator.Emit +// uint64_t CallStackPayloadOffset = Offset for the call stack payload +// uint64_t CallStackTableOffset = CallStackTableGenerator.Emit +// uint64_t Num schema entries +// uint64_t Schema entry 0 +// uint64_t Schema entry 1 +// .... +// uint64_t Schema entry N - 1 +// OnDiskChainedHashTable MemProfRecordData +// OnDiskChainedHashTable MemProfFrameData +// OnDiskChainedHashTable MemProfCallStackData +static Error writeMemProfV3(ProfOStream &OS, + memprof::IndexedMemProfData &MemProfData, + bool MemProfFullSchema) { + OS.write(memprof::Version3); + uint64_t HeaderUpdatePos = OS.tell(); + OS.write(0ULL); // Reserve space for the memprof record table offset. + OS.write(0ULL); // Reserve space for the memprof frame payload offset. + OS.write(0ULL); // Reserve space for the memprof frame table offset. + OS.write(0ULL); // Reserve space for the memprof call stack payload offset. 
+ OS.write(0ULL); // Reserve space for the memprof call stack table offset. + + auto Schema = memprof::getHotColdSchema(); + if (MemProfFullSchema) + Schema = memprof::getFullSchema(); + writeMemProfSchema(OS, Schema); + + uint64_t RecordTableOffset = writeMemProfRecords(OS, MemProfData.RecordData, + &Schema, memprof::Version3); + + uint64_t FramePayloadOffset = OS.tell(); + uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfData.FrameData); + + uint64_t CallStackPayloadOffset = OS.tell(); + uint64_t CallStackTableOffset = + writeMemProfCallStacks(OS, MemProfData.CallStackData); + + uint64_t Header[] = { + RecordTableOffset, FramePayloadOffset, FrameTableOffset, + CallStackPayloadOffset, CallStackTableOffset, + }; + OS.patch({{HeaderUpdatePos, Header, std::size(Header)}}); + + return Error::success(); +} + // Write out the MemProf data in a requested version. static Error writeMemProf(ProfOStream &OS, memprof::IndexedMemProfData &MemProfData, @@ -629,6 +679,8 @@ static Error writeMemProf(ProfOStream &OS, return writeMemProfV1(OS, MemProfData); case memprof::Version2: return writeMemProfV2(OS, MemProfData, MemProfFullSchema); + case memprof::Version3: + return writeMemProfV3(OS, MemProfData, MemProfFullSchema); } return make_error( diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 89afe7c39027c6..2f0e53736c82e5 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -52,6 +52,7 @@ size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema, case Version1: return serializedSizeV0(*this, Schema); case Version2: + case Version3: return serializedSizeV2(*this, Schema); } llvm_unreachable("unsupported MemProf version"); @@ -95,6 +96,7 @@ size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema, case Version1: return serializedSizeV0(*this, Schema); case Version2: + case Version3: return serializedSizeV2(*this, Schema); } llvm_unreachable("unsupported MemProf version"); 
@@ -149,6 +151,7 @@ void IndexedMemProfRecord::serialize(const MemProfSchema &Schema, serializeV0(*this, Schema, OS); return; case Version2: + case Version3: serializeV2(*this, Schema, OS); return; } @@ -239,6 +242,7 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, case Version1: return deserializeV0(Schema, Ptr); case Version2: + case Version3: return deserializeV2(Schema, Ptr); } llvm_unreachable("unsupported MemProf version"); diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-versions.test b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test index 28f65e0781bc63..aa7d0329425dc5 100644 --- a/llvm/test/tools/llvm-profdata/memprof-merge-versions.test +++ b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test @@ -19,6 +19,12 @@ RUN: llvm-profdata show %t.prof.v2 | FileCheck %s RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=2 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v2 RUN: llvm-profdata show %t.prof.v2 | FileCheck %s +RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3 +RUN: llvm-profdata show %t.prof.v3 | FileCheck %s + +RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3 +RUN: llvm-profdata show %t.prof.v3 | FileCheck %s + For now we only check the validity of the instrumented profile since we don't have a way to display the contents of the memprof indexed format yet. 
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 28c3afa1016473..fae6d1e989ab5a 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -306,7 +306,8 @@ cl::opt MemProfVersionRequested( cl::init(memprof::Version0), cl::values(clEnumValN(memprof::Version0, "0", "version 0"), clEnumValN(memprof::Version1, "1", "version 1"), - clEnumValN(memprof::Version2, "2", "version 2"))); + clEnumValN(memprof::Version2, "2", "version 2"), + clEnumValN(memprof::Version3, "3", "version 3"))); cl::opt MemProfFullSchema( "memprof-full-schema", cl::Hidden, cl::sub(MergeSubcommand), From 193e9007ef0bef6c881ab26746221f22ec674447 Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 28 May 2024 13:18:46 -0700 Subject: [PATCH 62/89] [OpenACC][NFC] Fix begin loc and split it from the directive location I discovered while working on something else that we were using the location of the directive name as the 'beginloc' which caused some problems in a few places. This patch makes it so our beginloc is the '#' as we originally designed, and then adds a DirectiveLoc concept to a construct for use diagnosing the name. 
--- clang/include/clang/AST/StmtOpenACC.h | 32 ++++++++++++++--------- clang/include/clang/Parse/Parser.h | 1 + clang/include/clang/Sema/SemaOpenACC.h | 3 ++- clang/lib/AST/StmtOpenACC.cpp | 13 +++++---- clang/lib/Parse/ParseOpenACC.cpp | 19 +++++++------- clang/lib/Sema/SemaOpenACC.cpp | 7 ++--- clang/lib/Sema/TreeTransform.h | 9 ++++--- clang/lib/Serialization/ASTReaderStmt.cpp | 1 + clang/lib/Serialization/ASTWriterStmt.cpp | 1 + 9 files changed, 49 insertions(+), 37 deletions(-) diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index b706864798baaf..04daf511f58713 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -31,6 +31,8 @@ class OpenACCConstructStmt : public Stmt { /// The location of the directive statement, from the '#' to the last token of /// the directive. SourceRange Range; + /// The location of the directive name. + SourceLocation DirectiveLoc; /// The list of clauses. This is stored here as an ArrayRef, as this is the /// most convienient place to access the list, however the list itself should @@ -39,8 +41,9 @@ class OpenACCConstructStmt : public Stmt { protected: OpenACCConstructStmt(StmtClass SC, OpenACCDirectiveKind K, - SourceLocation Start, SourceLocation End) - : Stmt(SC), Kind(K), Range(Start, End) {} + SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End) + : Stmt(SC), Kind(K), Range(Start, End), DirectiveLoc(DirectiveLoc) {} // Used only for initialization, the leaf class can initialize this to // trailing storage. 
@@ -59,6 +62,7 @@ class OpenACCConstructStmt : public Stmt { SourceLocation getBeginLoc() const { return Range.getBegin(); } SourceLocation getEndLoc() const { return Range.getEnd(); } + SourceLocation getDirectiveLoc() const { return DirectiveLoc; } ArrayRef clauses() const { return Clauses; } child_range children() { @@ -81,9 +85,11 @@ class OpenACCAssociatedStmtConstruct : public OpenACCConstructStmt { protected: OpenACCAssociatedStmtConstruct(StmtClass SC, OpenACCDirectiveKind K, - SourceLocation Start, SourceLocation End, - Stmt *AssocStmt) - : OpenACCConstructStmt(SC, K, Start, End), AssociatedStmt(AssocStmt) {} + SourceLocation Start, + SourceLocation DirectiveLoc, + SourceLocation End, Stmt *AssocStmt) + : OpenACCConstructStmt(SC, K, Start, DirectiveLoc, End), + AssociatedStmt(AssocStmt) {} void setAssociatedStmt(Stmt *S) { AssociatedStmt = S; } Stmt *getAssociatedStmt() { return AssociatedStmt; } @@ -126,10 +132,10 @@ class OpenACCComputeConstruct final friend class ASTStmtReader; friend class ASTContext; OpenACCComputeConstruct(unsigned NumClauses) - : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, - OpenACCDirectiveKind::Invalid, - SourceLocation{}, SourceLocation{}, - /*AssociatedStmt=*/nullptr) { + : OpenACCAssociatedStmtConstruct( + OpenACCComputeConstructClass, OpenACCDirectiveKind::Invalid, + SourceLocation{}, SourceLocation{}, SourceLocation{}, + /*AssociatedStmt=*/nullptr) { // We cannot send the TrailingObjects storage to the base class (which holds // a reference to the data) until it is constructed, so we have to set it // separately here. 
@@ -141,11 +147,11 @@ class OpenACCComputeConstruct final } OpenACCComputeConstruct(OpenACCDirectiveKind K, SourceLocation Start, - SourceLocation End, + SourceLocation DirectiveLoc, SourceLocation End, ArrayRef Clauses, Stmt *StructuredBlock) : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, K, Start, - End, StructuredBlock) { + DirectiveLoc, End, StructuredBlock) { assert(isOpenACCComputeDirectiveKind(K) && "Only parallel, serial, and kernels constructs should be " "represented by this type"); @@ -169,8 +175,8 @@ class OpenACCComputeConstruct final unsigned NumClauses); static OpenACCComputeConstruct * Create(const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc, - SourceLocation EndLoc, ArrayRef Clauses, - Stmt *StructuredBlock); + SourceLocation DirectiveLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *StructuredBlock); Stmt *getStructuredBlock() { return getAssociatedStmt(); } const Stmt *getStructuredBlock() const { diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 00b475e5b42824..d054b8cf0d2405 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3659,6 +3659,7 @@ class Parser : public CodeCompletionHandler { struct OpenACCDirectiveParseInfo { OpenACCDirectiveKind DirKind; SourceLocation StartLoc; + SourceLocation DirLoc; SourceLocation EndLoc; SmallVector Clauses; // TODO OpenACC: As we implement support for the Atomic, Routine, Cache, and diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 6f69fa08939b82..66144de4340a8a 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -379,7 +379,7 @@ class SemaOpenACC : public SemaBase { /// Called after the construct has been parsed, but clauses haven't been /// parsed. This allows us to diagnose not-implemented, as well as set up any /// state required for parsing the clauses. 
- void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation StartLoc); + void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation DirLoc); /// Called after the directive, including its clauses, have been parsed and /// parsing has consumed the 'annot_pragma_openacc_end' token. This DOES @@ -400,6 +400,7 @@ class SemaOpenACC : public SemaBase { /// declaration group or associated statement. StmtResult ActOnEndStmtDirective(OpenACCDirectiveKind K, SourceLocation StartLoc, + SourceLocation DirLoc, SourceLocation EndLoc, ArrayRef Clauses, StmtResult AssocStmt); diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp index a381a8dd7b62c3..47899b344c97ab 100644 --- a/clang/lib/AST/StmtOpenACC.cpp +++ b/clang/lib/AST/StmtOpenACC.cpp @@ -23,15 +23,14 @@ OpenACCComputeConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) { return Inst; } -OpenACCComputeConstruct * -OpenACCComputeConstruct::Create(const ASTContext &C, OpenACCDirectiveKind K, - SourceLocation BeginLoc, SourceLocation EndLoc, - ArrayRef Clauses, - Stmt *StructuredBlock) { +OpenACCComputeConstruct *OpenACCComputeConstruct::Create( + const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc, + SourceLocation DirLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *StructuredBlock) { void *Mem = C.Allocate( OpenACCComputeConstruct::totalSizeToAlloc( Clauses.size())); - auto *Inst = new (Mem) - OpenACCComputeConstruct(K, BeginLoc, EndLoc, Clauses, StructuredBlock); + auto *Inst = new (Mem) OpenACCComputeConstruct(K, BeginLoc, DirLoc, EndLoc, + Clauses, StructuredBlock); return Inst; } diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index e9c60f76165b68..63afc18783a1f7 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -1347,11 +1347,13 @@ void Parser::ParseOpenACCCacheVarList() { ParseOpenACCVarList(OpenACCClauseKind::Invalid); } -Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() { - 
SourceLocation StartLoc = getCurToken().getLocation(); +Parser::OpenACCDirectiveParseInfo +Parser::ParseOpenACCDirective() { + SourceLocation StartLoc = ConsumeAnnotationToken(); + SourceLocation DirLoc = getCurToken().getLocation(); OpenACCDirectiveKind DirKind = ParseOpenACCDirectiveKind(*this); - getActions().OpenACC().ActOnConstruct(DirKind, StartLoc); + getActions().OpenACC().ActOnConstruct(DirKind, DirLoc); // Once we've parsed the construct/directive name, some have additional // specifiers that need to be taken care of. Atomic has an 'atomic-clause' @@ -1390,7 +1392,7 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() { break; case OpenACCDirectiveKind::Wait: // OpenACC has an optional paren-wrapped 'wait-argument'. - if (ParseOpenACCWaitArgument(StartLoc, /*IsDirective=*/true).Failed) + if (ParseOpenACCWaitArgument(DirLoc, /*IsDirective=*/true).Failed) T.skipToEnd(); else T.consumeClose(); @@ -1404,7 +1406,8 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() { } // Parses the list of clauses, if present, plus set up return value. 
- OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, SourceLocation{}, + OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, DirLoc, + SourceLocation{}, ParseOpenACCClauseList(DirKind)}; assert(Tok.is(tok::annot_pragma_openacc_end) && @@ -1421,7 +1424,6 @@ Parser::DeclGroupPtrTy Parser::ParseOpenACCDirectiveDecl() { assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token"); ParsingOpenACCDirectiveRAII DirScope(*this); - ConsumeAnnotationToken(); OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective(); @@ -1438,7 +1440,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token"); ParsingOpenACCDirectiveRAII DirScope(*this); - ConsumeAnnotationToken(); OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective(); if (getActions().OpenACC().ActOnStartStmtDirective(DirInfo.DirKind, @@ -1456,6 +1457,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { } return getActions().OpenACC().ActOnEndStmtDirective( - DirInfo.DirKind, DirInfo.StartLoc, DirInfo.EndLoc, DirInfo.Clauses, - AssocStmt); + DirInfo.DirKind, DirInfo.StartLoc, DirInfo.DirLoc, DirInfo.EndLoc, + DirInfo.Clauses, AssocStmt); } diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 09d91b31cfe5f9..15239f4f35c39f 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -844,7 +844,7 @@ ExprResult SemaOpenACC::CheckReductionVar(Expr *VarExpr) { } void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, - SourceLocation StartLoc) { + SourceLocation DirLoc) { switch (K) { case OpenACCDirectiveKind::Invalid: // Nothing to do here, an invalid kind has nothing we can check here. We @@ -859,7 +859,7 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, // here as these constructs do not take any arguments. 
break; default: - Diag(StartLoc, diag::warn_acc_construct_unimplemented) << K; + Diag(DirLoc, diag::warn_acc_construct_unimplemented) << K; break; } } @@ -1265,6 +1265,7 @@ bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K, StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, SourceLocation StartLoc, + SourceLocation DirLoc, SourceLocation EndLoc, ArrayRef Clauses, StmtResult AssocStmt) { @@ -1278,7 +1279,7 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, case OpenACCDirectiveKind::Kernels: // TODO OpenACC: Add clauses to the construct here. return OpenACCComputeConstruct::Create( - getASTContext(), K, StartLoc, EndLoc, Clauses, + getASTContext(), K, StartLoc, DirLoc, EndLoc, Clauses, AssocStmt.isUsable() ? AssocStmt.get() : nullptr); } llvm_unreachable("Unhandled case in directive handling?"); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index dee335b526991b..765e6177d202d1 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4033,11 +4033,12 @@ class TreeTransform { StmtResult RebuildOpenACCComputeConstruct(OpenACCDirectiveKind K, SourceLocation BeginLoc, + SourceLocation DirLoc, SourceLocation EndLoc, ArrayRef Clauses, StmtResult StrBlock) { - return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, EndLoc, - Clauses, StrBlock); + return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, DirLoc, + EndLoc, Clauses, StrBlock); } private: @@ -11559,8 +11560,8 @@ StmtResult TreeTransform::TransformOpenACCComputeConstruct( getSema().OpenACC().ActOnAssociatedStmt(C->getDirectiveKind(), StrBlock); return getDerived().RebuildOpenACCComputeConstruct( - C->getDirectiveKind(), C->getBeginLoc(), C->getEndLoc(), - TransformedClauses, StrBlock); + C->getDirectiveKind(), C->getBeginLoc(), C->getDirectiveLoc(), + C->getEndLoc(), TransformedClauses, StrBlock); } //===----------------------------------------------------------------------===// diff --git 
a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index eac4faff285490..bea2b949891070 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2797,6 +2797,7 @@ void ASTStmtReader::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) { (void)Record.readInt(); S->Kind = Record.readEnum(); S->Range = Record.readSourceRange(); + S->DirectiveLoc = Record.readSourceLocation(); Record.readOpenACCClauseList(S->Clauses); } diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index a44852af97bea3..3c586b270fbf4f 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2847,6 +2847,7 @@ void ASTStmtWriter::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) { Record.push_back(S->clauses().size()); Record.writeEnum(S->Kind); Record.AddSourceRange(S->Range); + Record.AddSourceLocation(S->DirectiveLoc); Record.writeOpenACCClauseList(S->clauses()); } From 5a23d31c5033dcb41d374692ed26d87ed8e2665a Mon Sep 17 00:00:00 2001 From: William Junda Huang Date: Tue, 28 May 2024 16:41:53 -0400 Subject: [PATCH 63/89] [Sample Profile] Check hot callsite threshold when inlining a function with a sample profile (#93286) Currently if a callsite is hot as determined by the sample profile, it is unconditionally inlined barring invalid cases (such as recursion). Inline cost check should still apply because a function's hotness and its inline cost are two different things. For example if a function is calling another very large function multiple times (at different code paths), the large function should not be inlined even if its hot. 
--- llvm/lib/Transforms/IPO/SampleProfile.cpp | 7 ++- .../Inputs/inline-hot-callsite-threshold.prof | 3 + .../inline-hot-callsite-threshold.ll | 61 +++++++++++++++++++ .../SampleProfile/pseudo-probe-inline.ll | 2 +- llvm/test/Transforms/SampleProfile/remarks.ll | 4 +- 5 files changed, 71 insertions(+), 6 deletions(-) create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof create mode 100644 llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 0920179fb76b73..92ad4c34da6e7e 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1391,10 +1391,11 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { return InlineCost::getAlways("preinliner"); } - // For old FDO inliner, we inline the call site as long as cost is not - // "Never". The cost-benefit check is done earlier. + // For old FDO inliner, we inline the call site if it is below hot threshold, + // even if the function is hot based on sample profile data. This is to + // prevent huge functions from being inlined. 
if (!CallsitePrioritizedInline) { - return InlineCost::get(Cost.getCost(), INT_MAX); + return InlineCost::get(Cost.getCost(), SampleHotCallSiteThreshold); } // Otherwise only use the cost from call analyzer, but overwite threshold with diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof new file mode 100644 index 00000000000000..d1c0408210f498 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof @@ -0,0 +1,3 @@ +foo:100:100 + 1: bar:100 + 1:100 diff --git a/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll new file mode 100644 index 00000000000000..914ab4f1e3da58 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=100 2>&1 | FileCheck %s + +; CHECK: remark: a.cc:6:12: 'bar' inlined into 'foo' to match profiling context with (cost={{.*}}, threshold=100) +; CHECK: define dso_local noundef i32 @foo(i32 noundef %0) +; CHECK-NOT: %2 = tail call noundef i32 @bar(i32 noundef %0) +; CHECK-NEXT: %2 = icmp sgt i32 %0, 1 +; CHECK-NEXT: br i1 %2, label %3, label %bar.exit + +; Manually lower cost threshold for hot function inlining, so that the function +; is not inlined even profile indicates it as hot. 
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=1 2>&1 | FileCheck %s --check-prefix=COST + +; COST-NOT: remark +; COST: define dso_local noundef i32 @foo(i32 noundef %0) +; COST-NEXT: %2 = tail call noundef i32 @bar(i32 noundef %0) + +define dso_local noundef i32 @bar(i32 noundef %0) #0 !dbg !10 { + %2 = icmp sgt i32 %0, 1 + br i1 %2, label %3, label %15 +3: ; preds = %1 + %4 = add nsw i32 %0, -2 + %5 = mul i32 %4, %4 + %6 = add i32 %5, %0 + %7 = zext nneg i32 %4 to i33 + %8 = add nsw i32 %0, -3 + %9 = zext i32 %8 to i33 + %10 = mul i33 %7, %9 + %11 = lshr i33 %10, 1 + %12 = trunc nuw i33 %11 to i32 + %13 = xor i32 %12, -1 + %14 = add i32 %6, %13 + br label %15 +15: ; preds = %3, %1 + %16 = phi i32 [ 0, %1 ], [ %14, %3 ] + ret i32 %16 +} + +define dso_local noundef i32 @foo(i32 noundef %0) #1 !dbg !20 { + %2 = tail call noundef i32 @bar(i32 noundef %0), !dbg !24 + ret i32 %2 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "use-sample-profile" } +attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "use-sample-profile" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "a.cc", directory: ".") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: !1, file: !1, line: 1, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!11 = !DIFile(filename: "a.cc", directory: ".") +!12 = !DISubroutineType(types: !13) +!13 = !{!14, !14} +!14 = 
!DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!20 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !11, file: !11, line: 5, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!23 = !DILocation(line: 0, scope: !20) +!24 = !DILocation(line: 6, column: 12, scope: !20) diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll index 18cbd857d97bb2..2cd9abf0e11e94 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll @@ -98,7 +98,7 @@ if.end: ;YAML-NEXT: - String: '(cost=' ;YAML-NEXT: - Cost: '15' ;YAML-NEXT: - String: ', threshold=' -;YAML-NEXT: - Threshold: '2147483647' +;YAML-NEXT: - Threshold: '3000' ;YAML-NEXT: - String: ')' ;YAML-NEXT: - String: ' at callsite ' ;YAML-NEXT: - String: foo diff --git a/llvm/test/Transforms/SampleProfile/remarks.ll b/llvm/test/Transforms/SampleProfile/remarks.ll index 997e02bb5b5444..9c0143ae65ca77 100644 --- a/llvm/test/Transforms/SampleProfile/remarks.ll +++ b/llvm/test/Transforms/SampleProfile/remarks.ll @@ -22,7 +22,7 @@ ; We are expecting foo() to be inlined in main() (almost all the cycles are ; spent inside foo). -; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=2147483647) at callsite main:0:21; +; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=3000) at callsite main:0:21; ; CHECK: remark: remarks.cc:9:19: 'rand' inlined into 'main' to match profiling context with (cost=always): always inline attribute at callsite _Z3foov:6:19 @ main:0:21; ; The back edge for the loop is the hottest edge in the loop subgraph. 
@@ -51,7 +51,7 @@ ;YAML-NEXT: - String: '(cost=' ;YAML-NEXT: - Cost: '130' ;YAML-NEXT: - String: ', threshold=' -;YAML-NEXT: - Threshold: '2147483647' +;YAML-NEXT: - Threshold: '3000' ;YAML-NEXT: - String: ')' ;YAML-NEXT: - String: ' at callsite ' ;YAML-NEXT: - String: main From 6a47315a3cb2c6d381809f0ba5c89bd8dcdbcaa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 28 May 2024 22:45:32 +0200 Subject: [PATCH 64/89] [clang-repl] Even more tests create the Interpreter and must check host JIT support (#84758) --- .../Interpreter/CodeCompletionTest.cpp | 85 +++++++++++++++++++ .../Interpreter/IncrementalProcessingTest.cpp | 3 + 2 files changed, 88 insertions(+) diff --git a/clang/unittests/Interpreter/CodeCompletionTest.cpp b/clang/unittests/Interpreter/CodeCompletionTest.cpp index 873fbda32f0579..72c02c683fafd4 100644 --- a/clang/unittests/Interpreter/CodeCompletionTest.cpp +++ b/clang/unittests/Interpreter/CodeCompletionTest.cpp @@ -4,6 +4,7 @@ #include "clang/Lex/Preprocessor.h" #include "clang/Sema/CodeCompleteConsumer.h" #include "clang/Sema/Sema.h" +#include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/LineEditor/LineEditor.h" #include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" @@ -11,6 +12,10 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#if defined(_AIX) || defined(__MVS__) +#define CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +#endif + using namespace clang; namespace { auto CB = clang::IncrementalCompilerBuilder(); @@ -50,7 +55,21 @@ static std::vector runComp(clang::Interpreter &MainInterp, return Comps; } +static bool HostSupportsJit() { + auto J = llvm::orc::LLJITBuilder().create(); + if (J) + return true; + LLVMConsumeError(llvm::wrap(J.takeError())); + return false; +} + +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_Sanity) { +#else TEST(CodeCompletionTest, Sanity) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); 
cantFail(Interp->Parse("int foo = 12;")); auto Err = llvm::Error::success(); @@ -61,7 +80,13 @@ TEST(CodeCompletionTest, Sanity) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_SanityNoneValid) { +#else TEST(CodeCompletionTest, SanityNoneValid) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int foo = 12;")); auto Err = llvm::Error::success(); @@ -70,7 +95,13 @@ TEST(CodeCompletionTest, SanityNoneValid) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_TwoDecls) { +#else TEST(CodeCompletionTest, TwoDecls) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int application = 12;")); cantFail(Interp->Parse("int apple = 12;")); @@ -80,14 +111,26 @@ TEST(CodeCompletionTest, TwoDecls) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_CompFunDeclsNoError) { +#else TEST(CodeCompletionTest, CompFunDeclsNoError) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); auto Err = llvm::Error::success(); auto comps = runComp(*Interp, "void app(", Err); EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_TypedDirected) { +#else TEST(CodeCompletionTest, TypedDirected) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int application = 12;")); cantFail(Interp->Parse("char apple = '2';")); @@ -119,7 +162,13 @@ TEST(CodeCompletionTest, TypedDirected) { } } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_SanityClasses) { +#else TEST(CodeCompletionTest, SanityClasses) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = 
createInterpreter(); cantFail(Interp->Parse("struct Apple{};")); cantFail(Interp->Parse("void takeApple(Apple &a1){}")); @@ -142,7 +191,13 @@ TEST(CodeCompletionTest, SanityClasses) { } } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_SubClassing) { +#else TEST(CodeCompletionTest, SubClassing) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("struct Fruit {};")); cantFail(Interp->Parse("struct Apple : Fruit{};")); @@ -157,7 +212,13 @@ TEST(CodeCompletionTest, SubClassing) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_MultipleArguments) { +#else TEST(CodeCompletionTest, MultipleArguments) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse("int foo = 42;")); cantFail(Interp->Parse("char fowl = 'A';")); @@ -169,7 +230,13 @@ TEST(CodeCompletionTest, MultipleArguments) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_Methods) { +#else TEST(CodeCompletionTest, Methods) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse( "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};")); @@ -183,7 +250,13 @@ TEST(CodeCompletionTest, Methods) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_MethodsInvocations) { +#else TEST(CodeCompletionTest, MethodsInvocations) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse( "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};")); @@ -197,7 +270,13 @@ TEST(CodeCompletionTest, MethodsInvocations) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, 
DISABLED_NestedInvocations) { +#else TEST(CodeCompletionTest, NestedInvocations) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail(Interp->Parse( "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};")); @@ -212,7 +291,13 @@ TEST(CodeCompletionTest, NestedInvocations) { EXPECT_EQ((bool)Err, false); } +#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT +TEST(CodeCompletionTest, DISABLED_TemplateFunctions) { +#else TEST(CodeCompletionTest, TemplateFunctions) { +#endif + if (!HostSupportsJit()) + GTEST_SKIP(); auto Interp = createInterpreter(); cantFail( Interp->Parse("template T id(T a) { return a;} ")); diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp index f3b091b0c0e6cb..9a99ff6262fa3c 100644 --- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp +++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp @@ -61,6 +61,9 @@ TEST(IncrementalProcessing, DISABLED_EmitCXXGlobalInitFunc) { #else TEST(IncrementalProcessing, EmitCXXGlobalInitFunc) { #endif + if (!HostSupportsJit()) + GTEST_SKIP(); + std::vector ClangArgv = {"-Xclang", "-emit-llvm-only"}; auto CB = clang::IncrementalCompilerBuilder(); CB.SetCompilerArgs(ClangArgv); From 98fa0f6981f33b7d8f5aa38babc1e71bc0209de8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 28 May 2024 20:40:58 +0200 Subject: [PATCH 65/89] DAG: Handle vector splitting for fminnum_ieee/fmaxnum_ieee Avoids regression in future commit which starts producing illegal instances. 
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 14e8708fd3f38f..361416edb554ca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1174,8 +1174,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FADD: case ISD::VP_FADD: case ISD::FSUB: case ISD::VP_FSUB: case ISD::FMUL: case ISD::VP_FMUL: - case ISD::FMINNUM: case ISD::VP_FMINNUM: - case ISD::FMAXNUM: case ISD::VP_FMAXNUM: + case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: + case ISD::VP_FMINNUM: + case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: + case ISD::VP_FMAXNUM: case ISD::FMINIMUM: case ISD::VP_FMINIMUM: case ISD::FMAXIMUM: From bbca20f0b1ab7c6ea36a84e88a6abb07f94ca80b Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Tue, 28 May 2024 23:04:12 +0200 Subject: [PATCH 66/89] [Clang][NFC] remove CHAR_PUNCT duplication introduced by #93216 (#93605) --- clang/include/clang/Basic/CharInfo.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h index 4d90528f7992e3..d71857e8e5dcc3 100644 --- a/clang/include/clang/Basic/CharInfo.h +++ b/clang/include/clang/Basic/CharInfo.h @@ -151,8 +151,7 @@ LLVM_READONLY inline bool isHexDigit(unsigned char c) { /// Note that '_' is both a punctuation character and an identifier character! 
LLVM_READONLY inline bool isPunctuation(unsigned char c) { using namespace charinfo; - return (InfoTable[c] & - (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT | CHAR_PUNCT)) != 0; + return (InfoTable[c] & (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT)) != 0; } /// Return true if this character is an ASCII printable character; that is, a From df542e1ed82bd4e5a9e345d3a3ae63a76893a0cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 28 May 2024 23:18:45 +0200 Subject: [PATCH 67/89] Fix build: [clang-repl] Even more tests create the Interpreter and must check host JIT support (#84758) fea7399e97b73a3209fcbe3338d412069769a637 had removed the unused function that was still there when I tested. --- clang/unittests/Interpreter/IncrementalProcessingTest.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp index 9a99ff6262fa3c..732753f11306e6 100644 --- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp +++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp @@ -56,6 +56,14 @@ const Function *getGlobalInit(llvm::Module *M) { return nullptr; } +static bool HostSupportsJit() { + auto J = llvm::orc::LLJITBuilder().create(); + if (J) + return true; + LLVMConsumeError(llvm::wrap(J.takeError())); + return false; +} + #ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT TEST(IncrementalProcessing, DISABLED_EmitCXXGlobalInitFunc) { #else From ed4227aad37f2c4adf307b63050fb9aee52b07f8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 14:37:15 -0700 Subject: [PATCH 68/89] [SCEV] Add tests for symbolic max BTC requiring predicates. Add extra tests for https://github.com/llvm/llvm-project/pull/93498. 
--- ...cated-symbolic-max-backedge-taken-count.ll | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll new file mode 100644 index 00000000000000..d40416359b65c6 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s + +; %i and %i + 1 can overflow. +define void @test1(i64 %x, ptr %a, ptr %b) { +; CHECK-LABEL: 'test1' +; CHECK-NEXT: Determining loop execution counts for: @test1 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; +entry: + br label %header + +header: + %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ] + %i.010 = phi i32 [ 0, %entry ], [ %add, %latch ] + %add = add i32 %i.010, 1 + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom + %ld = load i32, ptr %arrayidx, align 4 + %uncountable.c = icmp eq i32 %ld, 10 + br i1 %uncountable.c, label %exit, label %latch + +latch: + %add2 = add nsw i32 %ld, 1 + %arrayidx4 = getelementptr inbounds i32, ptr %b, i64 %conv11 + store i32 %add2, ptr %arrayidx4, align 4 + %conv = zext i32 %add to i64 + %cmp = icmp ult i64 %conv, %x + br i1 %cmp, label %header, label %exit + +exit: + ret void +} + +; %i can overflow. +; +; We need to check that i doesn't wrap, but we don't need a run-time alias +; check. We also need an extra no-wrap check to get the backedge taken count. +define void @test2(i64 %x, ptr %a) { +; CHECK-LABEL: 'test2' +; CHECK-NEXT: Determining loop execution counts for: @test2 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; +entry: + br label %header + +header: + %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ] + %i.010 = phi i32 [ 0, %entry ], [ %inc, %latch ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %conv11 + %ld = load i32, ptr %arrayidx, align 4 + %uncountable.c = icmp eq i32 %ld, 10 + br i1 %uncountable.c, label %exit, label %latch + +latch: + %add = add nsw i32 %ld, 1 + store i32 %add, ptr %arrayidx, align 4 + %inc = add i32 %i.010, 1 + %conv = zext i32 %inc to i64 + %cmp = icmp ult i64 %conv, %x + br i1 %cmp, label %header, label %exit + +exit: + ret void +} From e3f74d4589e29279e9f543b58577a2ece102dc6f Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 28 May 2024 14:25:13 -0700 Subject: [PATCH 69/89] [OpenACC] Correct serialization of certain clause sub-expressions For some reason I was using writeStmtRef when I meant writeStmt, so this corrects that. 
--- clang/lib/Serialization/ASTWriter.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index dd548fabfd9551..e830c4026ea78f 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7835,7 +7835,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::If: { const auto *IC = cast(C); writeSourceLocation(IC->getLParenLoc()); - writeStmtRef(IC->getConditionExpr()); + AddStmt(const_cast(IC->getConditionExpr())); return; } case OpenACCClauseKind::Self: { @@ -7843,7 +7843,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeSourceLocation(SC->getLParenLoc()); writeBool(SC->hasConditionExpr()); if (SC->hasConditionExpr()) - writeStmtRef(SC->getConditionExpr()); + AddStmt(const_cast(SC->getConditionExpr())); return; } case OpenACCClauseKind::NumGangs: { @@ -7857,13 +7857,13 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::NumWorkers: { const auto *NWC = cast(C); writeSourceLocation(NWC->getLParenLoc()); - writeStmtRef(NWC->getIntExpr()); + AddStmt(const_cast(NWC->getIntExpr())); return; } case OpenACCClauseKind::VectorLength: { const auto *NWC = cast(C); writeSourceLocation(NWC->getLParenLoc()); - writeStmtRef(NWC->getIntExpr()); + AddStmt(const_cast(NWC->getIntExpr())); return; } case OpenACCClauseKind::Private: { @@ -7942,15 +7942,15 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeSourceLocation(AC->getLParenLoc()); writeBool(AC->hasIntExpr()); if (AC->hasIntExpr()) - writeStmtRef(AC->getIntExpr()); + AddStmt(const_cast(AC->getIntExpr())); return; } case OpenACCClauseKind::Wait: { const auto *WC = cast(C); writeSourceLocation(WC->getLParenLoc()); writeBool(WC->getDevNumExpr()); - if (const Expr *DNE = WC->getDevNumExpr()) - writeStmtRef(DNE); + if (Expr *DNE = WC->getDevNumExpr()) + 
AddStmt(DNE); writeSourceLocation(WC->getQueuesLoc()); writeOpenACCIntExprList(WC->getQueueIdExprs()); From 060b3023e198d197b47c652f19af5f7dea3a22cc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 14:49:57 -0700 Subject: [PATCH 70/89] [RISCV] Move TRUNCATE_VECTOR_VL combine into a helper function. NFC (#93574) I plan to add other combines on TRUNCATE_VECTOR_VL. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 103 ++++++++++---------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c826892c1668ec..5fc613c1b2a140 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16087,6 +16087,57 @@ static bool matchIndexAsWiderOp(EVT VT, SDValue Index, SDValue Mask, return true; } +static SDValue combineTruncOfSraSext(SDNode *N, SelectionDAG &DAG) { + // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1)) + // This would be benefit for the cases where X and Y are both the same value + // type of low precision vectors. Since the truncate would be lowered into + // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate + // restriction, such pattern would be expanded into a series of "vsetvli" + // and "vnsrl" instructions later to reach this point. + auto IsTruncNode = [](SDValue V) { + if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL) + return false; + SDValue VL = V.getOperand(2); + auto *C = dyn_cast(VL); + // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand + bool IsVLMAXForVMSET = (C && C->isAllOnes()) || + (isa(VL) && + cast(VL)->getReg() == RISCV::X0); + return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL && IsVLMAXForVMSET; + }; + + SDValue Op = N->getOperand(0); + + // We need to first find the inner level of TRUNCATE_VECTOR_VL node + // to distinguish such pattern. 
+ while (IsTruncNode(Op)) { + if (!Op.hasOneUse()) + return SDValue(); + Op = Op.getOperand(0); + } + + if (Op.getOpcode() != ISD::SRA || !Op.hasOneUse()) + return SDValue(); + + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + if (N0.getOpcode() != ISD::SIGN_EXTEND || !N0.hasOneUse() || + N1.getOpcode() != ISD::ZERO_EXTEND || !N1.hasOneUse()) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + if (!N00.getValueType().isVector() || + N00.getValueType() != N10.getValueType() || + N->getValueType(0) != N10.getValueType()) + return SDValue(); + + unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1; + SDValue SMin = + DAG.getNode(ISD::SMIN, SDLoc(N1), N->getValueType(0), N10, + DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0))); + return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin); +} SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -16304,56 +16355,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } } return SDValue(); - case RISCVISD::TRUNCATE_VECTOR_VL: { - // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1)) - // This would be benefit for the cases where X and Y are both the same value - // type of low precision vectors. Since the truncate would be lowered into - // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate - // restriction, such pattern would be expanded into a series of "vsetvli" - // and "vnsrl" instructions later to reach this point. 
- auto IsTruncNode = [](SDValue V) { - if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL) - return false; - SDValue VL = V.getOperand(2); - auto *C = dyn_cast(VL); - // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand - bool IsVLMAXForVMSET = (C && C->isAllOnes()) || - (isa(VL) && - cast(VL)->getReg() == RISCV::X0); - return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL && - IsVLMAXForVMSET; - }; - - SDValue Op = N->getOperand(0); - - // We need to first find the inner level of TRUNCATE_VECTOR_VL node - // to distinguish such pattern. - while (IsTruncNode(Op)) { - if (!Op.hasOneUse()) - return SDValue(); - Op = Op.getOperand(0); - } - - if (Op.getOpcode() == ISD::SRA && Op.hasOneUse()) { - SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && - N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse()) { - SDValue N00 = N0.getOperand(0); - SDValue N10 = N1.getOperand(0); - if (N00.getValueType().isVector() && - N00.getValueType() == N10.getValueType() && - N->getValueType(0) == N10.getValueType()) { - unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1; - SDValue SMin = DAG.getNode( - ISD::SMIN, SDLoc(N1), N->getValueType(0), N10, - DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0))); - return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin); - } - } - } - break; - } + case RISCVISD::TRUNCATE_VECTOR_VL: + return combineTruncOfSraSext(N, DAG); case ISD::TRUNCATE: return performTRUNCATECombine(N, DAG, Subtarget); case ISD::SELECT: From 00bd2fa1982f3114536323209fffad909463effc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 28 May 2024 14:57:13 -0700 Subject: [PATCH 71/89] [flang][cuda] Add bind c to cudadevice procedures (#92822) This patch adds bind c names to functions and subroutines in 
cudadevice so they can be lowered and not hit the intrinsic procedure TODOs. --- flang/module/cudadevice.f90 | 16 +++++----- flang/test/Lower/CUDA/cuda-device-proc.cuf | 36 ++++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-device-proc.cuf diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index f34820dd10792a..0224ecfdde7c60 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -18,34 +18,34 @@ module cudadevice ! Synchronization Functions interface - attributes(device) subroutine syncthreads() + attributes(device) subroutine syncthreads() bind(c, name='__syncthreads') end subroutine end interface public :: syncthreads interface - attributes(device) integer function syncthreads_and(value) + attributes(device) integer function syncthreads_and(value) bind(c, name='__syncthreads_and') integer :: value end function end interface public :: syncthreads_and interface - attributes(device) integer function syncthreads_count(value) + attributes(device) integer function syncthreads_count(value) bind(c, name='__syncthreads_count') integer :: value end function end interface public :: syncthreads_count interface - attributes(device) integer function syncthreads_or(value) + attributes(device) integer function syncthreads_or(value) bind(c, name='__syncthreads_or') integer :: value end function end interface public :: syncthreads_or interface - attributes(device) subroutine syncwarp(mask) + attributes(device) subroutine syncwarp(mask) bind(c, name='__syncwarp') integer :: mask end subroutine end interface @@ -54,19 +54,19 @@ attributes(device) subroutine syncwarp(mask) ! 
Memory Fences interface - attributes(device) subroutine threadfence() + attributes(device) subroutine threadfence() bind(c, name='__threadfence') end subroutine end interface public :: threadfence interface - attributes(device) subroutine threadfence_block() + attributes(device) subroutine threadfence_block() bind(c, name='__threadfence_block') end subroutine end interface public :: threadfence_block interface - attributes(device) subroutine threadfence_system() + attributes(device) subroutine threadfence_system() bind(c, name='__threadfence_system') end subroutine end interface public :: threadfence_system diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf new file mode 100644 index 00000000000000..0c71ea6efcd632 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -0,0 +1,36 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran procedures available in cudadevice module + +attributes(global) subroutine devsub() + implicit none + integer :: ret + + call syncthreads() + call syncwarp(1) + call threadfence() + call threadfence_block() + call threadfence_system() + ret = syncthreads_and(1) + ret = syncthreads_count(1) + ret = syncthreads_or(1) +end + +! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: fir.call @__syncthreads() +! CHECK: fir.call @__syncwarp(%{{.*}}) fastmath : (!fir.ref) -> () +! CHECK: fir.call @__threadfence() +! CHECK: fir.call @__threadfence_block() +! CHECK: fir.call @__threadfence_system() +! CHECK: %{{.*}} = fir.call @__syncthreads_and(%{{.*}}) fastmath : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @__syncthreads_count(%{{.*}}) fastmath : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @__syncthreads_or(%{{.*}}) fastmath : (!fir.ref) -> i32 + +! CHECK: func.func private @__syncthreads() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads"} +! 
CHECK: func.func private @__syncwarp(!fir.ref {cuf.data_attr = #cuf.cuda}) attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncwarp"} +! CHECK: func.func private @__threadfence() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence"} +! CHECK: func.func private @__threadfence_block() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence_block"} +! CHECK: func.func private @__threadfence_system() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__threadfence_system"} +! CHECK: func.func private @__syncthreads_and(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_and"} +! CHECK: func.func private @__syncthreads_count(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_count"} +! CHECK: func.func private @__syncthreads_or(!fir.ref {cuf.data_attr = #cuf.cuda}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads_or"} From 2d00c6fe06b6d709b4ab3d6b253df304c04e0c1f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 15:05:23 -0700 Subject: [PATCH 72/89] [RISCV] Add a rematerializable pseudo instruction for LUI+ADDI for global addresses. (#93352) This allows register allocation to rematerialize these instead of spilling and reloading. We need to make it a single instruction due to limitations in rematerialization. This pseudo is expanded to an LUI+ADDI pair between regalloc and post RA scheduling. This improves the dynamic instruction count on 531.deepsjeng_r from spec2017 by 3.2% for the train dataset. 500.perlbench and 502.gcc see a 1% improvement. There are couple regressions, but they are 0.1% or smaller. 
AArch64 has similar pseudo instructions like MOVaddr --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 20 ++ .../lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 35 ++- .../RISCV/RISCVPostRAExpandPseudoInsts.cpp | 23 ++ llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 22 +- .../CodeGen/RISCV/ctz_zero_return_test.ll | 8 +- .../early-clobber-tied-def-subreg-liveness.ll | 14 +- .../test/CodeGen/RISCV/fold-addi-loadstore.ll | 4 +- llvm/test/CodeGen/RISCV/rv32xtheadbb.ll | 4 +- llvm/test/CodeGen/RISCV/rv32zbb.ll | 4 +- .../CodeGen/RISCV/rvv/active_lane_mask.ll | 40 +-- .../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 4 +- .../rvv/fixed-vectors-interleaved-access.ll | 275 +++++++++--------- .../RISCV/rvv/fixed-vectors-mask-buildvec.ll | 20 +- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 16 +- .../rvv/fixed-vectors-shuffle-reverse.ll | 80 ++--- .../RISCV/rvv/fixed-vectors-stepvector.ll | 10 +- .../test/CodeGen/RISCV/rvv/shuffle-reverse.ll | 50 ++-- llvm/test/CodeGen/RISCV/tail-calls.ll | 8 +- llvm/test/CodeGen/RISCV/unroll-loop-cse.ll | 32 +- 19 files changed, 358 insertions(+), 311 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index ce50fe6e2cbb02..a1b078910e29c9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1311,6 +1311,26 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12), /// HI and ADD_LO address nodes. +// Pseudo for a rematerializable LUI+ADDI sequence for loading an address. +// It will be expanded after register allocation. +// FIXME: The scheduling information does not reflect the multiple instructions. 
+let Size = 8, isReMaterializable = 1 in +def PseudoMovAddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>, + Sched<[WriteIALU]>; + +def riscv_hi_oneuse : unop_oneuse; +def addr_hi_lo : PatFrag<(ops node:$hi, node:$lo), + (riscv_add_lo (riscv_hi_oneuse node:$hi), node:$lo)>; + +def : Pat<(addr_hi_lo tglobaladdr:$hi, tglobaladdr:$lo), + (PseudoMovAddr tglobaladdr:$hi, tglobaladdr:$lo)>; +def : Pat<(addr_hi_lo tblockaddress:$hi, tblockaddress:$lo), + (PseudoMovAddr tblockaddress:$hi, tblockaddress:$lo)>; +def : Pat<(addr_hi_lo tjumptable:$hi, tjumptable:$lo), + (PseudoMovAddr tjumptable:$hi, tjumptable:$lo)>; +def : Pat<(addr_hi_lo tconstpool:$hi, tconstpool:$lo), + (PseudoMovAddr tconstpool:$hi, tconstpool:$lo)>; + def : Pat<(riscv_hi tglobaladdr:$in), (LUI tglobaladdr:$in)>; def : Pat<(riscv_hi tblockaddress:$in), (LUI tblockaddress:$in)>; def : Pat<(riscv_hi tjumptable:$in), (LUI tjumptable:$in)>; diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index 410989177a8b9c..fecc83a821f420 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -84,7 +84,8 @@ INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, DEBUG_TYPE, // 3) The offset value in the Global Address or Constant Pool is 0. 
bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi, MachineInstr *&Lo) { - if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC) + if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC && + Hi.getOpcode() != RISCV::PseudoMovAddr) return false; const MachineOperand &HiOp1 = Hi.getOperand(1); @@ -97,16 +98,22 @@ bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi, HiOp1.getOffset() != 0) return false; - Register HiDestReg = Hi.getOperand(0).getReg(); - if (!MRI->hasOneUse(HiDestReg)) - return false; + if (Hi.getOpcode() == RISCV::PseudoMovAddr) { + // Most of the code should handle it correctly without modification by + // setting Lo and Hi both point to PseudoMovAddr + Lo = &Hi; + } else { + Register HiDestReg = Hi.getOperand(0).getReg(); + if (!MRI->hasOneUse(HiDestReg)) + return false; - Lo = &*MRI->use_instr_begin(HiDestReg); - if (Lo->getOpcode() != RISCV::ADDI) - return false; + Lo = &*MRI->use_instr_begin(HiDestReg); + if (Lo->getOpcode() != RISCV::ADDI) + return false; + } const MachineOperand &LoOp2 = Lo->getOperand(2); - if (Hi.getOpcode() == RISCV::LUI) { + if (Hi.getOpcode() == RISCV::LUI || Hi.getOpcode() == RISCV::PseudoMovAddr) { if (LoOp2.getTargetFlags() != RISCVII::MO_LO || !(LoOp2.isGlobal() || LoOp2.isCPI() || LoOp2.isBlockAddress()) || LoOp2.getOffset() != 0) @@ -466,6 +473,13 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, Hi.getOperand(1).setOffset(NewOffset); MachineOperand &ImmOp = Lo.getOperand(2); + // Expand PseudoMovAddr into LUI + if (Hi.getOpcode() == RISCV::PseudoMovAddr) { + auto *TII = ST->getInstrInfo(); + Hi.setDesc(TII->get(RISCV::LUI)); + Hi.removeOperand(2); + } + if (Hi.getOpcode() != RISCV::AUIPC) ImmOp.setOffset(NewOffset); @@ -501,6 +515,11 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, } } + // Prevent Lo (originally PseudoMovAddr, which is also pointed by Hi) from + // being erased + if (&Lo == &Hi) + return true; + 
MRI->replaceRegWith(Lo.getOperand(0).getReg(), Hi.getOperand(0).getReg()); Lo.eraseFromParent(); return true; diff --git a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp index 52f2ce27164d6e..b7b0c47c084c64 100644 --- a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp @@ -44,6 +44,7 @@ class RISCVPostRAExpandPseudo : public MachineFunctionPass { bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); bool expandMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMovAddr(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; char RISCVPostRAExpandPseudo::ID = 0; @@ -75,6 +76,8 @@ bool RISCVPostRAExpandPseudo::expandMI(MachineBasicBlock &MBB, switch (MBBI->getOpcode()) { case RISCV::PseudoMovImm: return expandMovImm(MBB, MBBI); + case RISCV::PseudoMovAddr: + return expandMovAddr(MBB, MBBI); default: return false; } @@ -101,6 +104,26 @@ bool RISCVPostRAExpandPseudo::expandMovImm(MachineBasicBlock &MBB, return true; } +bool RISCVPostRAExpandPseudo::expandMovAddr(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + DebugLoc DL = MBBI->getDebugLoc(); + + Register DstReg = MBBI->getOperand(0).getReg(); + bool DstIsDead = MBBI->getOperand(0).isDead(); + bool Renamable = MBBI->getOperand(0).isRenamable(); + + BuildMI(MBB, MBBI, DL, TII->get(RISCV::LUI)) + .addReg(DstReg, RegState::Define | getRenamableRegState(Renamable)) + .add(MBBI->getOperand(1)); + BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead) | + getRenamableRegState(Renamable)) + .addReg(DstReg, RegState::Kill | getRenamableRegState(Renamable)) + .add(MBBI->getOperand(2)); + MBBI->eraseFromParent(); + return true; +} + } // end of anonymous namespace INITIALIZE_PASS(RISCVPostRAExpandPseudo, "riscv-expand-pseudolisimm32", diff 
--git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index 549d531e829ea5..a90c244437a033 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -383,8 +383,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI3_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0) +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 @@ -442,9 +442,9 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32M-LABEL: test_cttz_i64: ; RV32M: # %bb.0: ; RV32M-NEXT: lui a2, 30667 -; RV32M-NEXT: addi a2, a2, 1329 -; RV32M-NEXT: lui a3, %hi(.LCPI3_0) -; RV32M-NEXT: addi a3, a3, %lo(.LCPI3_0) +; RV32M-NEXT: addi a3, a2, 1329 +; RV32M-NEXT: lui a2, %hi(.LCPI3_0) +; RV32M-NEXT: addi a2, a2, %lo(.LCPI3_0) ; RV32M-NEXT: bnez a1, .LBB3_3 ; RV32M-NEXT: # %bb.1: ; RV32M-NEXT: li a1, 32 @@ -452,18 +452,18 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32M-NEXT: .LBB3_2: ; RV32M-NEXT: neg a1, a0 ; RV32M-NEXT: and a0, a0, a1 -; RV32M-NEXT: mul a0, a0, a2 +; RV32M-NEXT: mul a0, a0, a3 ; RV32M-NEXT: srli a0, a0, 27 -; RV32M-NEXT: add a0, a3, a0 +; RV32M-NEXT: add a0, a2, a0 ; RV32M-NEXT: lbu a0, 0(a0) ; RV32M-NEXT: li a1, 0 ; RV32M-NEXT: ret ; RV32M-NEXT: .LBB3_3: ; RV32M-NEXT: neg a4, a1 ; RV32M-NEXT: and a1, a1, a4 -; RV32M-NEXT: mul a1, a1, a2 +; RV32M-NEXT: mul a1, a1, a3 ; RV32M-NEXT: srli a1, a1, 27 -; RV32M-NEXT: add a1, a3, a1 +; RV32M-NEXT: add a1, a2, a1 ; RV32M-NEXT: lbu a1, 0(a1) ; RV32M-NEXT: bnez a0, .LBB3_2 ; RV32M-NEXT: .LBB3_4: @@ -814,8 +814,8 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI7_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI7_0) +; RV32I-NEXT: lui s4, %hi(.LCPI7_0) 
+; RV32I-NEXT: addi s4, s4, %lo(.LCPI7_0) ; RV32I-NEXT: neg a0, s1 ; RV32I-NEXT: and a0, s1, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll index 9ae30e646fdbf7..fe6e20d852d590 100644 --- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll +++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll @@ -48,8 +48,8 @@ define signext i32 @ctz_dereferencing_pointer(ptr %b) nounwind { ; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI0_0) -; RV32I-NEXT: addi s3, a0, %lo(.LCPI0_0) +; RV32I-NEXT: lui s3, %hi(.LCPI0_0) +; RV32I-NEXT: addi s3, s3, %lo(.LCPI0_0) ; RV32I-NEXT: neg a0, s4 ; RV32I-NEXT: and a0, s4, a0 ; RV32I-NEXT: mv a1, s1 @@ -511,8 +511,8 @@ define signext i32 @ctz4(i64 %b) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI6_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI6_0) +; RV32I-NEXT: lui s4, %hi(.LCPI6_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI6_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll index eb6ac985287a10..478d2eae9dca2c 100644 --- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -24,31 +24,31 @@ define void @_Z3foov() { ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_49) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_49) ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_48) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_48) -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: 
add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46) -; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_45) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45) -; CHECK-NEXT: vle16.v v14, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v12, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v14, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vs2r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_40) diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll index 3c2e84689c979c..62b1549a5d58ad 100644 --- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll @@ -389,8 +389,8 @@ define dso_local i32 @load_ga() local_unnamed_addr #0 { define dso_local i64 @load_ga_8() nounwind { ; RV32I-LABEL: load_ga_8: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a0, %hi(ga_8) -; RV32I-NEXT: addi a1, a0, %lo(ga_8) +; RV32I-NEXT: lui a1, %hi(ga_8) +; RV32I-NEXT: addi a1, a1, %lo(ga_8) ; RV32I-NEXT: lw a0, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll index b45ab135fa1c7c..197366e7e05fe8 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll @@ -209,8 
+209,8 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI3_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0) +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 7e6c3f9c87d277..f25aa0de89da88 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -199,8 +199,8 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, %hi(.LCPI3_0) -; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0) +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) ; RV32I-NEXT: neg a0, s2 ; RV32I-NEXT: and a0, s2, a0 ; RV32I-NEXT: mv a1, s3 diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll index 9cb3991f31f94d..08b310213d16e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -126,28 +126,28 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v24, v16 +; CHECK-NEXT: vsaddu.vx v16, v24, a1 +; CHECK-NEXT: vmsltu.vx v9, v16, a2 ; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v8, v16, a2 -; CHECK-NEXT: vsext.vf8 v16, v9 -; 
CHECK-NEXT: vsaddu.vx v16, v16, a1 ; CHECK-NEXT: lui a0, %hi(.LCPI9_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmsltu.vx v10, v16, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v8, 2 +; CHECK-NEXT: vslideup.vi v0, v9, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v10, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma @@ -169,13 +169,13 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v10, v16, a2 +; CHECK-NEXT: vmsltu.vx v8, v16, a2 ; CHECK-NEXT: vsext.vf8 v16, v9 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v8, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) ; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vmsltu.vx v10, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_3) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) ; CHECK-NEXT: vle8.v v11, (a0) @@ -187,10 +187,10 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vmsltu.vx v11, v16, a2 ; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_4) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) ; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_5) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) ; CHECK-NEXT: vle8.v v13, (a0) @@ -201,27 +201,27 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsaddu.vx v16, v16, a1 ; CHECK-NEXT: vmsltu.vx v13, v16, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: 
vslideup.vi v10, v8, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: vslideup.vi v10, v9, 4 ; CHECK-NEXT: lui a0, %hi(.LCPI10_6) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v11, 6 +; CHECK-NEXT: vslideup.vi v10, v11, 6 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v12, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v13, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v9, v16, a2 +; CHECK-NEXT: vmsltu.vx v8, v16, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v0, v9, 6 +; CHECK-NEXT: vslideup.vi v0, v8, 6 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vi v0, v8, 8 +; CHECK-NEXT: vslideup.vi v0, v10, 8 ; CHECK-NEXT: ret %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc) ret <128 x i1> %mask diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 79c36a629465d9..f4d7074c7f6b27 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -3459,6 +3459,8 @@ define void @mulhu_v4i64(ptr %x) { ; RV64-NEXT: lui a1, %hi(.LCPI184_0) ; RV64-NEXT: addi a1, a1, %lo(.LCPI184_0) ; RV64-NEXT: vle64.v v10, (a1) +; RV64-NEXT: vmulhu.vv v10, v8, v10 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: li a1, -1 ; RV64-NEXT: slli a1, a1, 63 ; RV64-NEXT: vmv.s.x v12, a1 @@ -3466,8 +3468,6 @@ define void @mulhu_v4i64(ptr %x) { ; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma ; RV64-NEXT: vslideup.vi v14, v12, 2 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; 
RV64-NEXT: vmulhu.vv v10, v8, v10 -; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: vmulhu.vv v8, v8, v14 ; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a1, 12320 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 178a920169ad96..bc3e135a588a6f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -159,17 +159,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 82 +; RV32-NEXT: li a3, 80 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 82 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb ; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 57 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 6 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill @@ -177,26 +176,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vslideup.vi v8, v16, 4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 41 +; RV32-NEXT: li a5, 40 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 -; RV32-NEXT: vmv.s.x v1, a4 +; RV32-NEXT: vmv.s.x v0, a4 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi 
v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 6 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: li a5, 56 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vmv1r.v v3, v0 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 45 +; RV32-NEXT: li a5, 44 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -206,8 +205,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 5 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: slli a4, a4, 5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill @@ -216,21 +214,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a5, 1 ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a6, 25 +; RV32-NEXT: li a6, 24 ; RV32-NEXT: mul a4, a4, a6 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 73 +; RV32-NEXT: li a4, 72 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -238,27 +236,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a5, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; 
RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v16, v8, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -266,259 +263,257 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v12, v8, 2 +; RV32-NEXT: vmv1r.v v8, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v1, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vs1r.v v3, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; 
RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vslideup.vi v12, v16, 8, v0.t -; RV32-NEXT: vmv.v.v v20, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) -; RV32-NEXT: lui a4, %hi(.LCPI6_4) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: vle16.v v16, (a3) -; RV32-NEXT: addi a1, a4, %lo(.LCPI6_4) +; RV32-NEXT: vle16.v v0, (a1) +; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: lui a1, %hi(.LCPI6_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v2, (a1) +; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v24, v8, v4 +; RV32-NEXT: vrgatherei16.vv v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v24 +; RV32-NEXT: vmv.v.v v12, 
v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v16, v24, v2 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vrgatherei16.vv v12, v24, v10 +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v16, v8, 6, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v12, v24, 6, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v4, (a3) -; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; 
RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 960 +; RV32-NEXT: vmv.s.x v8, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v0, v12 +; RV32-NEXT: vmv1r.v v3, v8 +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) ; RV32-NEXT: lui a3, %hi(.LCPI6_8) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_8) -; RV32-NEXT: lui a4, %hi(.LCPI6_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: addi a1, a4, %lo(.LCPI6_9) +; RV32-NEXT: lui a1, %hi(.LCPI6_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_9) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v24, (a3) -; RV32-NEXT: vle16.v v28, (a1) +; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs4r.v v12, 
(a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v4, v0, v8 +; RV32-NEXT: vrgatherei16.vv v12, v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v4, v8, 4, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v24, v16 +; RV32-NEXT: vslideup.vi v12, v16, 4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v16, v4 +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; 
RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v3, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v16, 6 +; RV32-NEXT: vslideup.vi v8, v16, 6 ; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: vle16.v v12, (a3) +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: 
csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8 +; RV32-NEXT: vrgatherei16.vv v8, v16, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 2 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_13) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13) ; RV32-NEXT: lui a3, %hi(.LCPI6_14) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_14) -; RV32-NEXT: lui a4, %hi(.LCPI6_15) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a1) -; RV32-NEXT: addi a1, a4, %lo(.LCPI6_15) +; RV32-NEXT: lui a1, %hi(.LCPI6_15) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_15) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; RV32-NEXT: vle16.v v24, (a3) ; RV32-NEXT: vle16.v v8, (a1) @@ -526,27 +521,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded 
Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vrgatherei16.vv v16, v8, v20, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -554,7 +548,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -562,12 +556,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 49 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -576,31 +570,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 21 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: 
addi a1, a1, 16 -; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 13 +; RV32-NEXT: li a2, 12 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 57 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v28, v0 ; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: addi a1, a0, 320 @@ -614,21 +605,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 45 +; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 82 +; RV32-NEXT: li a1, 80 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll index 17483151869365..7608349ef7aeff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -549,20 +549,20 @@ define <128 x i1> @buildvec_mask_v128i1() { define <128 x i1> @buildvec_mask_optsize_v128i1() optsize { ; CHECK-LABEL: buildvec_mask_optsize_v128i1: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI21_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI21_0) -; CHECK-NEXT: li a1, 128 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: lui a1, %hi(.LCPI21_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI21_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_optsize_v128i1: ; ZVE32F: # %bb.0: -; ZVE32F-NEXT: lui a0, %hi(.LCPI21_0) -; ZVE32F-NEXT: addi a0, a0, %lo(.LCPI21_0) -; ZVE32F-NEXT: li a1, 128 -; ZVE32F-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; ZVE32F-NEXT: vlm.v v0, (a0) +; ZVE32F-NEXT: li a0, 128 +; ZVE32F-NEXT: lui a1, %hi(.LCPI21_0) +; ZVE32F-NEXT: addi a1, a1, %lo(.LCPI21_0) +; ZVE32F-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; ZVE32F-NEXT: vlm.v v0, (a1) ; ZVE32F-NEXT: ret ret <128 x i1> } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index db0969c85a8e24..69341981288b91 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -13327,22 +13327,22 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; RV32-LABEL: mgather_shuffle_vrgather: ; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI119_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI119_0) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: 
vle16.v v9, (a0) -; RV32-NEXT: lui a0, %hi(.LCPI119_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI119_0) +; RV32-NEXT: vle16.v v9, (a1) ; RV32-NEXT: vle16.v v10, (a0) -; RV32-NEXT: vrgather.vv v8, v9, v10 +; RV32-NEXT: vrgather.vv v8, v10, v9 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_shuffle_vrgather: ; RV64V: # %bb.0: +; RV64V-NEXT: lui a1, %hi(.LCPI119_0) +; RV64V-NEXT: addi a1, a1, %lo(.LCPI119_0) ; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vle16.v v9, (a0) -; RV64V-NEXT: lui a0, %hi(.LCPI119_0) -; RV64V-NEXT: addi a0, a0, %lo(.LCPI119_0) +; RV64V-NEXT: vle16.v v9, (a1) ; RV64V-NEXT: vle16.v v10, (a0) -; RV64V-NEXT: vrgather.vv v8, v9, v10 +; RV64V-NEXT: vrgather.vv v8, v10, v9 ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_shuffle_vrgather: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index d70ed2fb0e2665..4b1f0beb487008 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -228,11 +228,11 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) { define <32 x i8> @reverse_v32i8(<32 x i8> %a) { ; CHECK-LABEL: reverse_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI12_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI12_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI12_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -243,11 +243,11 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) { define <64 x i8> @reverse_v64i8(<64 x i8> %a) { ; CHECK-LABEL: reverse_v64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI13_0) -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: 
vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: lui a1, %hi(.LCPI13_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI13_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -323,11 +323,11 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) { define <32 x i16> @reverse_v32i16(<32 x i16> %a) { ; CHECK-LABEL: reverse_v32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI19_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI19_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vsext.vf2 v16, v12 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 @@ -520,11 +520,11 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) { define <32 x half> @reverse_v32f16(<32 x half> %a) { ; CHECK-LABEL: reverse_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI34_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI34_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vsext.vf2 v16, v12 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 @@ -820,33 +820,33 @@ define <6 x i64> @reverse_v6i64(<6 x i64> %a) { define <12 x i64> @reverse_v12i64(<12 x i64> %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_v12i64: ; RV32-BITS-UNKNOWN: # %bb.0: -; RV32-BITS-UNKNOWN-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-BITS-UNKNOWN-NEXT: li a1, 32 -; 
RV32-BITS-UNKNOWN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vle16.v v24, (a0) +; RV32-BITS-UNKNOWN-NEXT: li a0, 32 +; RV32-BITS-UNKNOWN-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-BITS-UNKNOWN-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vle16.v v24, (a1) ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-BITS-UNKNOWN-NEXT: vmv.v.v v8, v16 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_v12i64: ; RV32-BITS-256: # %bb.0: -; RV32-BITS-256-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-BITS-256-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-BITS-256-NEXT: li a1, 32 -; RV32-BITS-256-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-BITS-256-NEXT: vle16.v v24, (a0) +; RV32-BITS-256-NEXT: li a0, 32 +; RV32-BITS-256-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-BITS-256-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-BITS-256-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-BITS-256-NEXT: vle16.v v24, (a1) ; RV32-BITS-256-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-BITS-256-NEXT: vmv.v.v v8, v16 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_v12i64: ; RV32-BITS-512: # %bb.0: -; RV32-BITS-512-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-BITS-512-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-BITS-512-NEXT: li a1, 32 -; RV32-BITS-512-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-BITS-512-NEXT: vle16.v v24, (a0) +; RV32-BITS-512-NEXT: li a0, 32 +; RV32-BITS-512-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-BITS-512-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-BITS-512-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-BITS-512-NEXT: vle16.v v24, (a1) ; RV32-BITS-512-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-BITS-512-NEXT: vmv.v.v v8, v16 ; RV32-BITS-512-NEXT: ret @@ -883,11 +883,11 @@ define <12 x i64> @reverse_v12i64(<12 x i64> %a) { ; ; RV32-ZVBB-LABEL: reverse_v12i64: ; RV32-ZVBB: # %bb.0: -; RV32-ZVBB-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-ZVBB-NEXT: addi a0, a0, %lo(.LCPI46_0) -; RV32-ZVBB-NEXT: li 
a1, 32 -; RV32-ZVBB-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-ZVBB-NEXT: vle16.v v24, (a0) +; RV32-ZVBB-NEXT: li a0, 32 +; RV32-ZVBB-NEXT: lui a1, %hi(.LCPI46_0) +; RV32-ZVBB-NEXT: addi a1, a1, %lo(.LCPI46_0) +; RV32-ZVBB-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-ZVBB-NEXT: vle16.v v24, (a1) ; RV32-ZVBB-NEXT: vrgatherei16.vv v16, v8, v24 ; RV32-ZVBB-NEXT: vmv.v.v v8, v16 ; RV32-ZVBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll index 0161ac4bc338db..e2580c132f65e9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll @@ -225,11 +225,11 @@ declare <16 x i64> @llvm.experimental.stepvector.v16i64() define <16 x i64> @stepvector_v16i64() { ; RV32-LABEL: stepvector_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI16_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI16_0) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle8.v v16, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: lui a1, %hi(.LCPI16_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_0) +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vle8.v v16, (a1) ; RV32-NEXT: vsext.vf4 v8, v16 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll index 6e327457bebffc..368f454fa5fda1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll @@ -106,11 +106,11 @@ define <16 x i8> @v16i8(<16 x i8> %a) { define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v16i8_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI7_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI7_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI7_0) +; 
CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vmv1r.v v14, v9 ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vid.v v8 @@ -230,11 +230,11 @@ define <16 x i16> @v16i16(<16 x i16> %a) { define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: v16i16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI15_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI15_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI15_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI15_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: vmv2r.v v20, v10 ; CHECK-NEXT: vmv2r.v v12, v8 ; CHECK-NEXT: vrgather.vv v8, v12, v16 @@ -363,11 +363,11 @@ define <16 x i32> @v16i32(<16 x i32> %a) { define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: v16i32_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI23_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle16.v v20, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI23_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI23_0) +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vle16.v v20, (a1) ; CHECK-NEXT: vmv4r.v v24, v12 ; CHECK-NEXT: vmv4r.v v16, v8 ; CHECK-NEXT: vrgatherei16.vv v8, v16, v20 @@ -548,11 +548,11 @@ define <16 x half> @v16f16(<16 x half> %a) { define <32 x half> @v16f16_2(<16 x half> %a) { ; CHECK-LABEL: v16f16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI35_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI35_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI35_0) +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a1) ; 
CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -719,11 +719,11 @@ define <8 x double> @v4f64_2(<4 x double> %a, <4 x double> %b) { define <32 x i8> @v32i8(<32 x i8> %a) { ; CHECK-LABEL: v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI46_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI46_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI46_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll index 87d69bfad38c2b..d3e495bb723ad8 100644 --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -56,12 +56,12 @@ define void @caller_indirect_tail(i32 %a) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: beqz a0, .LBB3_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: lui a0, %hi(callee_indirect2) -; CHECK-NEXT: addi t1, a0, %lo(callee_indirect2) +; CHECK-NEXT: lui t1, %hi(callee_indirect2) +; CHECK-NEXT: addi t1, t1, %lo(callee_indirect2) ; CHECK-NEXT: jr t1 ; CHECK-NEXT: .LBB3_2: -; CHECK-NEXT: lui a0, %hi(callee_indirect1) -; CHECK-NEXT: addi t1, a0, %lo(callee_indirect1) +; CHECK-NEXT: lui t1, %hi(callee_indirect1) +; CHECK-NEXT: addi t1, t1, %lo(callee_indirect1) ; CHECK-NEXT: jr t1 diff --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll index 2fd4572d234567..65307363048376 100644 --- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll +++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll @@ -10,36 +10,30 @@ define signext i32 @unroll_loop_cse() { ; CHECK-LABEL: unroll_loop_cse: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(x) -; CHECK-NEXT: lw a3, %lo(x)(a1) -; CHECK-NEXT: lui a2, %hi(check) -; 
CHECK-NEXT: lw a4, %lo(check)(a2) +; CHECK-NEXT: lui a0, %hi(x) +; CHECK-NEXT: lw a1, %lo(x)(a0) +; CHECK-NEXT: lui a0, %hi(check) +; CHECK-NEXT: lw a2, %lo(check)(a0) ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: bne a3, a4, .LBB0_6 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi a1, a1, %lo(x) -; CHECK-NEXT: lw a1, 4(a1) -; CHECK-NEXT: addi a2, a2, %lo(check) -; CHECK-NEXT: lw a2, 4(a2) ; CHECK-NEXT: bne a1, a2, .LBB0_6 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a1, %hi(x) ; CHECK-NEXT: addi a1, a1, %lo(x) -; CHECK-NEXT: lw a3, 8(a1) +; CHECK-NEXT: lw a3, 4(a1) ; CHECK-NEXT: lui a2, %hi(check) ; CHECK-NEXT: addi a2, a2, %lo(check) +; CHECK-NEXT: lw a4, 4(a2) +; CHECK-NEXT: bne a3, a4, .LBB0_6 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: lw a3, 8(a1) ; CHECK-NEXT: lw a4, 8(a2) ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: lw a1, 12(a1) -; CHECK-NEXT: lw a2, 12(a2) -; CHECK-NEXT: bne a1, a2, .LBB0_6 +; CHECK-NEXT: lw a3, 12(a1) +; CHECK-NEXT: lw a4, 12(a2) +; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.4: -; CHECK-NEXT: lui a1, %hi(x) -; CHECK-NEXT: addi a1, a1, %lo(x) ; CHECK-NEXT: lw a3, 16(a1) -; CHECK-NEXT: lui a2, %hi(check) -; CHECK-NEXT: addi a2, a2, %lo(check) ; CHECK-NEXT: lw a4, 16(a2) ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.5: From 765206e050453018e861637a08a4520f29238074 Mon Sep 17 00:00:00 2001 From: gulfemsavrun Date: Tue, 28 May 2024 15:06:11 -0700 Subject: [PATCH 73/89] [CodeGen] Hidden visibility for prof version var (#93496) This patch adds hidden visibility to the variable that is used by the single byte counters mode in source-based code coverage. 
--- clang/lib/CodeGen/CodeGenPGO.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 76704c4d7be4a4..db8e6f55302adc 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1340,7 +1340,7 @@ void CodeGenPGO::setProfileVersion(llvm::Module &M) { llvm::APInt(64, ProfileVersion)), VarName); - IRLevelVersionVariable->setVisibility(llvm::GlobalValue::DefaultVisibility); + IRLevelVersionVariable->setVisibility(llvm::GlobalValue::HiddenVisibility); llvm::Triple TT(M.getTargetTriple()); if (TT.supportsCOMDAT()) { IRLevelVersionVariable->setLinkage(llvm::GlobalValue::ExternalLinkage); From 067b4ccb4b5ab93ac2dc2243248a8934fa1f7ce3 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 28 May 2024 15:19:04 -0700 Subject: [PATCH 74/89] Upstream libc++ buildbot restarter. (#93582) I've been running a cronjob on my local machine to restart preempted libc++ CI runs. This is bad and brittle. This upstreams a much better version of the restarter. It works by matching on check run annotations looking for mention of the machine being shutdown. If there are both preempted jobs and failing jobs, we don't restart the workflow. Maybe we should change that? --- .../restart-preempted-libcxx-jobs.yaml | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 .github/workflows/restart-preempted-libcxx-jobs.yaml diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml new file mode 100644 index 00000000000000..a71f2084182e5e --- /dev/null +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -0,0 +1,109 @@ +name: Restart Preempted Libc++ Workflow + +# The libc++ builders run on preemptable VMs, which can be shutdown at any time. +# This workflow identifies when a workflow run was canceled due to the VM being preempted, +# and restarts the workflow run. 
+ +# We identify a canceled workflow run by checking the annotations of the check runs in the check suite, +# which should contain the message "The runner has received a shutdown signal." + +# Note: If a job is both preempted and also contains a non-preemption failure, we do not restart the workflow. + +on: + workflow_run: + workflows: + - "Build and Test libc\+\+" + types: + - failure + - canceled + +permissions: + contents: read + +jobs: + restart: + if: github.repository_owner == 'llvm' + name: "Restart Job" + permissions: + statuses: read + checks: read + actions: write + runs-on: ubuntu-latest + steps: + - name: "Restart Job" + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1 + with: + script: | + const failure_regex = /Process completed with exit code 1./ + const preemption_regex = /The runner has received a shutdown signal/ + + console.log('Listing check runs for suite') + const check_suites = await github.rest.checks.listForSuite({ + owner: context.repo.owner, + repo: context.repo.repo, + check_suite_id: context.payload.workflow_run.check_suite_id + }) + + check_run_ids = []; + for (check_run of check_suites.data.check_runs) { + console.log('Checking check run: ' + check_run.id); + console.log(check_run); + if (check_run.status != 'completed') { + console.log('Check run was not completed. Skipping.'); + continue; + } + if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') { + console.log('Check run had conclusion: ' + check_run.conclusion + '. 
Skipping.'); + continue; + } + check_run_ids.push(check_run.id); + } + + has_preempted_job = false; + + for (check_run_id of check_run_ids) { + console.log('Listing annotations for check run: ' + check_run_id); + + annotations = await github.rest.checks.listAnnotations({ + owner: context.repo.owner, + repo: context.repo.repo, + check_run_id: check_run_id + }) + + console.log(annotations); + for (annotation of annotations.data) { + if (annotation.annotation_level != 'failure') { + continue; + } + + const preemption_match = annotation.message.match(preemption_regex); + + if (preemption_match != null) { + console.log('Found preemption message: ' + annotation.message); + has_preempted_job = true; + } + + const failure_match = annotation.message.match(failure_regex); + if (failure_match != null) { + // We only want to restart the workflow if all of the failures were due to preemption. + // We don't want to restart the workflow if there were other failures. + console.log('Choosing not to rerun workflow because we found a non-preemption failure'); + console.log('Failure message: ' + annotation.message); + return; + } + } + } + + if (!has_preempted_job) { + console.log('No preempted jobs found. 
Not restarting workflow.'); + return; + } + + console.log("Restarted workflow: " + context.payload.workflow_run.id); + await github.rest.actions.reRunWorkflowFailedJobs({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: context.payload.workflow_run.id + }) + + From b9cdea66b62e2eb91814ef7c57ea01aa27440e72 Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Tue, 28 May 2024 18:23:14 -0400 Subject: [PATCH 75/89] Attempt to fix issue with plus sign in libc++ workflow name --- .github/workflows/restart-preempted-libcxx-jobs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml index a71f2084182e5e..5682b0a4f52c3d 100644 --- a/.github/workflows/restart-preempted-libcxx-jobs.yaml +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -12,7 +12,7 @@ name: Restart Preempted Libc++ Workflow on: workflow_run: workflows: - - "Build and Test libc\+\+" + - Build and Test libc\+\+ types: - failure - canceled From 6aeea700df6f3f8db9e6a79be4aa593c6fcc7d18 Mon Sep 17 00:00:00 2001 From: Spenser Bauman Date: Tue, 28 May 2024 18:29:17 -0400 Subject: [PATCH 76/89] [mlir][dataflow] Fix for integer range analysis propagation bug (#93199) Integer range analysis will not update the range of an operation when any of the inferred input lattices are uninitialized. In the current behavior, all lattice values for non integer types are uninitialized. For operations like arith.cmpf ```mlir %3 = arith.cmpf ugt, %arg0, %arg1 : f32 ``` that will result in the range of the output also being uninitialized, and so on for any consumer of the arith.cmpf result. When control-flow ops are involved, the lack of propagation results in incorrect ranges, as the back edges for loop carried values are not properly joined with the definitions from the body region. 
For example, an scf.while loop whose body region produces a value that is in a dataflow relationship with some floating-point values through an arith.cmpf operation: ```mlir func.func @test_bad_range(%arg0: f32, %arg1: f32) -> (index, index) { %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %3 = arith.cmpf ugt, %arg0, %arg1 : f32 %1:2 = scf.while (%arg2 = %c0, %arg3 = %c0) : (index, index) -> (index, index) { %2 = arith.cmpi ult, %arg2, %c4 : index scf.condition(%2) %arg2, %arg3 : index, index } do { ^bb0(%arg2: index, %arg3: index): %4 = arith.select %3, %arg3, %arg3 : index %5 = arith.addi %arg2, %c1 : index scf.yield %5, %4 : index, index } return %1#0, %1#1 : index, index } ``` The existing behavior results in the control condition %2 being optimized to true, turning the while loop into an infinite loop. The update to %arg2 through the body region is never factored into the range calculation, as the ranges for the body ops all test as uninitialized. This change causes all values initialized with setToEntryState to be set to some initialized range, even if the values are not integers. 
--------- Co-authored-by: Spenser Bauman --- .../Analysis/DataFlow/IntegerRangeAnalysis.h | 45 ----------- .../include/mlir/Dialect/Arith/IR/ArithOps.td | 16 ++-- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 12 +-- .../include/mlir/Dialect/Index/IR/IndexOps.td | 2 +- .../mlir/Interfaces/InferIntRangeInterface.h | 75 ++++++++++++++++++- .../mlir/Interfaces/InferIntRangeInterface.td | 46 +++++++++--- .../Interfaces/Utils/InferIntRangeCommon.h | 8 +- .../DataFlow/IntegerRangeAnalysis.cpp | 51 ++++--------- .../Arith/IR/InferIntRangeInterfaceImpls.cpp | 18 +++-- .../lib/Interfaces/InferIntRangeInterface.cpp | 48 ++++++++++++ .../Interfaces/Utils/InferIntRangeCommon.cpp | 2 +- .../Dialect/Arith/int-range-interface.mlir | 19 +++++ mlir/test/lib/Dialect/Test/TestOps.td | 9 ++- 13 files changed, 230 insertions(+), 121 deletions(-) diff --git a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h index 8bd7cf880c6afb..191c023fb642cb 100644 --- a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h @@ -24,51 +24,6 @@ namespace mlir { namespace dataflow { -/// This lattice value represents the integer range of an SSA value. -class IntegerValueRange { -public: - /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)]) - /// range that is used to mark the value as unable to be analyzed further, - /// where `t` is the type of `value`. - static IntegerValueRange getMaxRange(Value value); - - /// Create an integer value range lattice value. - IntegerValueRange(std::optional value = std::nullopt) - : value(std::move(value)) {} - - /// Whether the range is uninitialized. This happens when the state hasn't - /// been set during the analysis. - bool isUninitialized() const { return !value.has_value(); } - - /// Get the known integer value range. 
- const ConstantIntRanges &getValue() const { - assert(!isUninitialized()); - return *value; - } - - /// Compare two ranges. - bool operator==(const IntegerValueRange &rhs) const { - return value == rhs.value; - } - - /// Take the union of two ranges. - static IntegerValueRange join(const IntegerValueRange &lhs, - const IntegerValueRange &rhs) { - if (lhs.isUninitialized()) - return rhs; - if (rhs.isUninitialized()) - return lhs; - return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())}; - } - - /// Print the integer value range. - void print(raw_ostream &os) const { os << value; } - -private: - /// The known integer value range. - std::optional value; -}; - /// This lattice element represents the integer value range of an SSA value. /// When this lattice is updated, it automatically updates the constant value /// of the SSA value (if the range can be narrowed to one). diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td index ead52332e8eec3..46248dad3be9e0 100644 --- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td +++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td @@ -49,7 +49,7 @@ class Arith_BinaryOp traits = []> : // Base class for integer binary operations. class Arith_IntBinaryOp traits = []> : Arith_BinaryOp]>, + [DeclareOpInterfaceMethods]>, Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs)>, Results<(outs SignlessIntegerLike:$result)>; @@ -107,7 +107,7 @@ class Arith_IToICastOp traits = []> : Arith_CastOp]>; + [DeclareOpInterfaceMethods]>; // Cast from an integer type to a floating point type. 
class Arith_IToFCastOp traits = []> : Arith_CastOp; @@ -139,7 +139,7 @@ class Arith_CompareOpOfAnyRank traits = []> : class Arith_IntBinaryOpWithOverflowFlags traits = []> : Arith_BinaryOp, + [Pure, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>, Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs, DefaultValuedAttr< @@ -159,7 +159,7 @@ def Arith_ConstantOp : Op, AllTypesMatch<["value", "result"]>, - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods]> { let summary = "integer or floating point constant"; let description = [{ The `constant` operation produces an SSA value equal to some integer or @@ -1327,7 +1327,7 @@ def IndexCastTypeConstraint : TypeConstraint]> { + [DeclareOpInterfaceMethods]> { let summary = "cast between index and integer types"; let description = [{ Casts between scalar or vector integers and corresponding 'index' scalar or @@ -1346,7 +1346,7 @@ def Arith_IndexCastOp def Arith_IndexCastUIOp : Arith_CastOp<"index_castui", IndexCastTypeConstraint, IndexCastTypeConstraint, - [DeclareOpInterfaceMethods]> { + [DeclareOpInterfaceMethods]> { let summary = "unsigned cast between index and integer types"; let description = [{ Casts between scalar or vector integers and corresponding 'index' scalar or @@ -1400,7 +1400,7 @@ def Arith_BitcastOp : Arith_CastOp<"bitcast", BitcastTypeConstraint, def Arith_CmpIOp : Arith_CompareOpOfAnyRank<"cmpi", - [DeclareOpInterfaceMethods]> { + [DeclareOpInterfaceMethods]> { let summary = "integer comparison operation"; let description = [{ The `cmpi` operation is a generic comparison for integer-like types. 
Its two @@ -1555,7 +1555,7 @@ class ScalarConditionOrMatchingShape names> : def SelectOp : Arith_Op<"select", [Pure, AllTypesMatch<["true_value", "false_value", "result"]>, ScalarConditionOrMatchingShape<["condition", "result"]>, - DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, ] # ElementwiseMappable.traits> { let summary = "select operation"; let description = [{ diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 1da68ed2176d8f..10719aae5c8b46 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -52,7 +52,7 @@ def GPU_DimensionAttr : EnumAttr; class GPU_IndexOp traits = []> : GPU_Op, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods])>, Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> { let assemblyFormat = "$dimension attr-dict"; @@ -144,7 +144,7 @@ def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> { } def GPU_LaneIdOp : GPU_Op<"lane_id", [ - Pure, DeclareOpInterfaceMethods]> { + Pure, DeclareOpInterfaceMethods]> { let description = [{ Returns the lane id within the subgroup (warp/wave). @@ -158,7 +158,7 @@ def GPU_LaneIdOp : GPU_Op<"lane_id", [ } def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [ - Pure, DeclareOpInterfaceMethods]>, + Pure, DeclareOpInterfaceMethods]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the subgroup id, i.e., the index of the current subgroup within the @@ -190,7 +190,7 @@ def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> { def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [ - Pure, DeclareOpInterfaceMethods]>, + Pure, DeclareOpInterfaceMethods]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the number of subgroups within a workgroup. 
@@ -206,7 +206,7 @@ def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [ } def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [ - Pure, DeclareOpInterfaceMethods]>, + Pure, DeclareOpInterfaceMethods]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the number of threads within a subgroup. @@ -687,7 +687,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [ def GPU_LaunchOp : GPU_Op<"launch", [ AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface, - DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, RecursiveMemoryEffects]>, Arguments<(ins Variadic:$asyncDependencies, Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, diff --git a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td index c6079cb8a98c81..a30ae9f739cbc6 100644 --- a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td +++ b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td @@ -25,7 +25,7 @@ include "mlir/IR/OpBase.td" /// Base class for Index dialect operations. class IndexOp traits = []> : Op] # traits>; + [DeclareOpInterfaceMethods] # traits>; //===----------------------------------------------------------------------===// // IndexBinaryOp diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h index 05064a72ef02e7..0e107e88f5232f 100644 --- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h +++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h @@ -105,10 +105,83 @@ class ConstantIntRanges { raw_ostream &operator<<(raw_ostream &, const ConstantIntRanges &); +/// This lattice value represents the integer range of an SSA value. +class IntegerValueRange { +public: + /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)]) + /// range that is used to mark the value as unable to be analyzed further, + /// where `t` is the type of `value`. 
+ static IntegerValueRange getMaxRange(Value value); + + /// Create an integer value range lattice value. + IntegerValueRange(ConstantIntRanges value) : value(std::move(value)) {} + + /// Create an integer value range lattice value. + IntegerValueRange(std::optional value = std::nullopt) + : value(std::move(value)) {} + + /// Whether the range is uninitialized. This happens when the state hasn't + /// been set during the analysis. + bool isUninitialized() const { return !value.has_value(); } + + /// Get the known integer value range. + const ConstantIntRanges &getValue() const { + assert(!isUninitialized()); + return *value; + } + + /// Compare two ranges. + bool operator==(const IntegerValueRange &rhs) const { + return value == rhs.value; + } + + /// Compute the least upper bound of two ranges. + static IntegerValueRange join(const IntegerValueRange &lhs, + const IntegerValueRange &rhs) { + if (lhs.isUninitialized()) + return rhs; + if (rhs.isUninitialized()) + return lhs; + return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())}; + } + + /// Print the integer value range. + void print(raw_ostream &os) const { os << value; } + +private: + /// The known integer value range. + std::optional value; +}; + +raw_ostream &operator<<(raw_ostream &, const IntegerValueRange &); + /// The type of the `setResultRanges` callback provided to ops implementing /// InferIntRangeInterface. It should be called once for each integer result /// value and be passed the ConstantIntRanges corresponding to that value. -using SetIntRangeFn = function_ref; +using SetIntRangeFn = + llvm::function_ref; + +/// Similar to SetIntRangeFn, but operating on IntegerValueRange lattice values. +/// This is the `setResultRanges` callback for the IntegerValueRange based +/// interface method. 
+using SetIntLatticeFn = + llvm::function_ref; + +class InferIntRangeInterface; + +namespace intrange::detail { +/// Default implementation of `inferResultRanges` which dispatches to the +/// `inferResultRangesFromOptional`. +void defaultInferResultRanges(InferIntRangeInterface interface, + ArrayRef argRanges, + SetIntLatticeFn setResultRanges); + +/// Default implementation of `inferResultRangesFromOptional` which dispatches +/// to the `inferResultRanges`. +void defaultInferResultRangesFromOptional(InferIntRangeInterface interface, + ArrayRef argRanges, + SetIntRangeFn setResultRanges); +} // end namespace intrange::detail } // end namespace mlir #include "mlir/Interfaces/InferIntRangeInterface.h.inc" diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td index dbdc526c6f10b6..6ee436ce4d6c2f 100644 --- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td +++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td @@ -28,9 +28,10 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> { Infer the bounds on the results of this op given the bounds on its arguments. For each result value or block argument (that isn't a branch argument, since the dataflow analysis handles those case), the method should call - `setValueRange` with that `Value` as an argument. When `setValueRange` - is not called for some value, it will recieve a default value of the mimimum - and maximum values for its type (the unbounded range). + `setValueRange` with that `Value` as an argument. When implemented, + `setValueRange` should be called on all result values for the operation. + When operations take non-integer inputs, the + `inferResultRangesFromOptional` method should be implemented instead. 
When called on an op that also implements the RegionBranchOpInterface or BranchOpInterface, this method should not attempt to infer the values @@ -39,14 +40,39 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> { This function will only be called when at least one result of the op is a scalar integer value or the op has a region. + }], + /*retTy=*/"void", + /*methodName=*/"inferResultRanges", + /*args=*/(ins "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges, + "::mlir::SetIntRangeFn":$setResultRanges), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + ::mlir::intrange::detail::defaultInferResultRangesFromOptional($_op, + argRanges, + setResultRanges); + }]>, + + InterfaceMethod<[{ + Infer the bounds on the results of this op given the lattice representation + of the bounds for its arguments. For each result value or block argument + (that isn't a branch argument, since the dataflow analysis handles + those case), the method should call `setValueRange` with that `Value` + as an argument. When implemented, `setValueRange` should be called on + all result values for the operation. - `argRanges` contains one `IntRangeAttrs` for each argument to the op in ODS - order. Non-integer arguments will have the an unbounded range of width-0 - APInts in their `argRanges` element. + This method allows for more precise implementations when operations + want to reason about inputs which may be undefined during the analysis. 
}], - "void", "inferResultRanges", (ins - "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges, - "::mlir::SetIntRangeFn":$setResultRanges) - >]; + /*retTy=*/"void", + /*methodName=*/"inferResultRangesFromOptional", + /*args=*/(ins "::llvm::ArrayRef<::mlir::IntegerValueRange>":$argRanges, + "::mlir::SetIntLatticeFn":$setResultRanges), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + ::mlir::intrange::detail::defaultInferResultRanges($_op, + argRanges, + setResultRanges); + }]> + ]; } #endif // MLIR_INTERFACES_INFERINTRANGEINTERFACE diff --git a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h index 851bb534bc7ee1..3988a8826498a9 100644 --- a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h +++ b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h @@ -25,7 +25,11 @@ namespace intrange { /// abstracted away here to permit writing the function that handles both /// 64- and 32-bit index types. using InferRangeFn = - function_ref)>; + std::function)>; + +/// Function that performs inferrence on an array of `IntegerValueRange`. +using InferIntegerValueRangeFn = + std::function)>; static constexpr unsigned indexMinWidth = 32; static constexpr unsigned indexMaxWidth = 64; @@ -52,7 +56,7 @@ using InferRangeWithOvfFlagsFn = /// /// The `mode` argument specifies if the unsigned, signed, or both results of /// the inference computation should be used when comparing the results. 
-ConstantIntRanges inferIndexOp(InferRangeFn inferFn, +ConstantIntRanges inferIndexOp(const InferRangeFn &inferFn, ArrayRef argRanges, CmpMode mode); diff --git a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp index a82c30717e275b..9721620807a0f0 100644 --- a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp @@ -36,17 +36,6 @@ using namespace mlir; using namespace mlir::dataflow; -IntegerValueRange IntegerValueRange::getMaxRange(Value value) { - unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType()); - if (width == 0) - return {}; - APInt umin = APInt::getMinValue(width); - APInt umax = APInt::getMaxValue(width); - APInt smin = width != 0 ? APInt::getSignedMinValue(width) : umin; - APInt smax = width != 0 ? APInt::getSignedMaxValue(width) : umax; - return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}}; -} - void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const { Lattice::onUpdate(solver); @@ -72,24 +61,17 @@ void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const { void IntegerRangeAnalysis::visitOperation( Operation *op, ArrayRef operands, ArrayRef results) { - // If the lattice on any operand is unitialized, bail out. 
- if (llvm::any_of(operands, [](const IntegerValueRangeLattice *lattice) { - return lattice->getValue().isUninitialized(); - })) { - return; - } - auto inferrable = dyn_cast(op); if (!inferrable) return setAllToEntryStates(results); LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n"); - SmallVector argRanges( - llvm::map_range(operands, [](const IntegerValueRangeLattice *val) { - return val->getValue().getValue(); - })); + auto argRanges = llvm::map_to_vector( + operands, [](const IntegerValueRangeLattice *lattice) { + return lattice->getValue(); + }); - auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) { + auto joinCallback = [&](Value v, const IntegerValueRange &attrs) { auto result = dyn_cast(v); if (!result) return; @@ -99,7 +81,7 @@ void IntegerRangeAnalysis::visitOperation( IntegerValueRangeLattice *lattice = results[result.getResultNumber()]; IntegerValueRange oldRange = lattice->getValue(); - ChangeResult changed = lattice->join(IntegerValueRange{attrs}); + ChangeResult changed = lattice->join(attrs); // Catch loop results with loop variant bounds and conservatively make // them [-inf, inf] so we don't circle around infinitely often (because @@ -116,7 +98,7 @@ void IntegerRangeAnalysis::visitOperation( propagateIfChanged(lattice, changed); }; - inferrable.inferResultRanges(argRanges, joinCallback); + inferrable.inferResultRangesFromOptional(argRanges, joinCallback); } void IntegerRangeAnalysis::visitNonControlFlowArguments( @@ -124,17 +106,12 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( ArrayRef argLattices, unsigned firstIndex) { if (auto inferrable = dyn_cast(op)) { LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n"); - // If the lattice on any operand is unitialized, bail out. 
- if (llvm::any_of(op->getOperands(), [&](Value value) { - return getLatticeElementFor(op, value)->getValue().isUninitialized(); - })) - return; - SmallVector argRanges( - llvm::map_range(op->getOperands(), [&](Value value) { - return getLatticeElementFor(op, value)->getValue().getValue(); - })); - auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) { + auto argRanges = llvm::map_to_vector(op->getOperands(), [&](Value value) { + return getLatticeElementFor(op, value)->getValue(); + }); + + auto joinCallback = [&](Value v, const IntegerValueRange &attrs) { auto arg = dyn_cast(v); if (!arg) return; @@ -145,7 +122,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( IntegerValueRangeLattice *lattice = argLattices[arg.getArgNumber()]; IntegerValueRange oldRange = lattice->getValue(); - ChangeResult changed = lattice->join(IntegerValueRange{attrs}); + ChangeResult changed = lattice->join(attrs); // Catch loop results with loop variant bounds and conservatively make // them [-inf, inf] so we don't circle around infinitely often (because @@ -162,7 +139,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( propagateIfChanged(lattice, changed); }; - inferrable.inferResultRanges(argRanges, joinCallback); + inferrable.inferResultRangesFromOptional(argRanges, joinCallback); return; } diff --git a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp index fbe2ecab8adcaa..462044417b5fb8 100644 --- a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp +++ b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp @@ -295,18 +295,24 @@ void arith::CmpIOp::inferResultRanges(ArrayRef argRanges, // SelectOp //===----------------------------------------------------------------------===// -void arith::SelectOp::inferResultRanges(ArrayRef argRanges, - SetIntRangeFn setResultRange) { - std::optional mbCondVal = argRanges[0].getConstantValue(); +void 
arith::SelectOp::inferResultRangesFromOptional( + ArrayRef argRanges, SetIntLatticeFn setResultRange) { + std::optional mbCondVal = + argRanges[0].isUninitialized() + ? std::nullopt + : argRanges[0].getValue().getConstantValue(); + + const IntegerValueRange &trueCase = argRanges[1]; + const IntegerValueRange &falseCase = argRanges[2]; if (mbCondVal) { if (mbCondVal->isZero()) - setResultRange(getResult(), argRanges[2]); + setResultRange(getResult(), falseCase); else - setResultRange(getResult(), argRanges[1]); + setResultRange(getResult(), trueCase); return; } - setResultRange(getResult(), argRanges[1].rangeUnion(argRanges[2])); + setResultRange(getResult(), IntegerValueRange::join(trueCase, falseCase)); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Interfaces/InferIntRangeInterface.cpp b/mlir/lib/Interfaces/InferIntRangeInterface.cpp index b3f6c0ee3cc32d..d879b93586899b 100644 --- a/mlir/lib/Interfaces/InferIntRangeInterface.cpp +++ b/mlir/lib/Interfaces/InferIntRangeInterface.cpp @@ -126,3 +126,51 @@ raw_ostream &mlir::operator<<(raw_ostream &os, const ConstantIntRanges &range) { return os << "unsigned : [" << range.umin() << ", " << range.umax() << "] signed : [" << range.smin() << ", " << range.smax() << "]"; } + +IntegerValueRange IntegerValueRange::getMaxRange(Value value) { + unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType()); + if (width == 0) + return {}; + + APInt umin = APInt::getMinValue(width); + APInt umax = APInt::getMaxValue(width); + APInt smin = width != 0 ? APInt::getSignedMinValue(width) : umin; + APInt smax = width != 0 ? 
APInt::getSignedMaxValue(width) : umax; + return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}}; +} + +raw_ostream &mlir::operator<<(raw_ostream &os, const IntegerValueRange &range) { + range.print(os); + return os; +} + +void mlir::intrange::detail::defaultInferResultRanges( + InferIntRangeInterface interface, ArrayRef argRanges, + SetIntLatticeFn setResultRanges) { + llvm::SmallVector unpacked; + unpacked.reserve(argRanges.size()); + + for (const IntegerValueRange &range : argRanges) { + if (range.isUninitialized()) + return; + unpacked.push_back(range.getValue()); + } + + interface.inferResultRanges( + unpacked, + [&setResultRanges](Value value, const ConstantIntRanges &argRanges) { + setResultRanges(value, IntegerValueRange{argRanges}); + }); +} + +void mlir::intrange::detail::defaultInferResultRangesFromOptional( + InferIntRangeInterface interface, ArrayRef argRanges, + SetIntRangeFn setResultRanges) { + auto ranges = llvm::to_vector_of(argRanges); + interface.inferResultRangesFromOptional( + ranges, + [&setResultRanges](Value value, const IntegerValueRange &argRanges) { + if (!argRanges.isUninitialized()) + setResultRanges(value, argRanges.getValue()); + }); +} diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp index fe1a67d6287386..5b8d35e7bd5197 100644 --- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp +++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp @@ -76,7 +76,7 @@ static ConstantIntRanges minMaxBy(ConstArithFn op, ArrayRef lhs, //===----------------------------------------------------------------------===// ConstantIntRanges -mlir::intrange::inferIndexOp(InferRangeFn inferFn, +mlir::intrange::inferIndexOp(const InferRangeFn &inferFn, ArrayRef argRanges, intrange::CmpMode mode) { ConstantIntRanges sixtyFour = inferFn(argRanges); diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir index 
5b538197a0c117..60f0ab41afa48d 100644 --- a/mlir/test/Dialect/Arith/int-range-interface.mlir +++ b/mlir/test/Dialect/Arith/int-range-interface.mlir @@ -899,3 +899,22 @@ func.func @test_shl_i8_nowrap() -> i8 { %2 = test.reflect_bounds %1 : i8 return %2: i8 } + +/// A test case to ensure that the ranges for unsupported ops are initialized +/// properly to maxRange, rather than left uninitialized. +/// In this test case, the previous behavior would leave the ranges for %a and +/// %b uninitialized, resulting in arith.cmpf's range not being updated, even +/// though it has an integer valued result. + +// CHECK-LABEL: func @test_cmpf_propagates +// CHECK: test.reflect_bounds {smax = 2 : index, smin = 1 : index, umax = 2 : index, umin = 1 : index} +func.func @test_cmpf_propagates(%a: f32, %b: f32) -> index { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + %0 = arith.cmpf ueq, %a, %b : f32 + %1 = arith.select %0, %c1, %c2 : index + %2 = test.reflect_bounds %1 : index + func.return %2 : index +} + diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 18324482153a54..9d7e0a7928ab8d 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -2750,7 +2750,7 @@ def TestGraphLoopOp : TEST_Op<"graph_loop", def InferIntRangeType : AnyTypeOf<[AnyInteger, Index]>; def TestWithBoundsOp : TEST_Op<"with_bounds", - [DeclareOpInterfaceMethods, + [DeclareOpInterfaceMethods, NoMemoryEffect]> { let arguments = (ins APIntAttr:$umin, APIntAttr:$umax, @@ -2762,7 +2762,7 @@ def TestWithBoundsOp : TEST_Op<"with_bounds", } def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region", - [DeclareOpInterfaceMethods, + [DeclareOpInterfaceMethods, SingleBlock, NoTerminator]> { let arguments = (ins APIntAttr:$umin, APIntAttr:$umax, @@ -2774,7 +2774,7 @@ def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region", } def TestIncrementOp : TEST_Op<"increment", - [DeclareOpInterfaceMethods, + 
[DeclareOpInterfaceMethods, NoMemoryEffect, AllTypesMatch<["value", "result"]>]> { let arguments = (ins InferIntRangeType:$value); let results = (outs InferIntRangeType:$result); @@ -2783,7 +2783,8 @@ def TestIncrementOp : TEST_Op<"increment", } def TestReflectBoundsOp : TEST_Op<"reflect_bounds", - [DeclareOpInterfaceMethods, AllTypesMatch<["value", "result"]>]> { + [DeclareOpInterfaceMethods, + AllTypesMatch<["value", "result"]>]> { let arguments = (ins InferIntRangeType:$value, OptionalAttr:$umin, OptionalAttr:$umax, From 20d497c26fc95c80a1bacb38820d92e5f52bec58 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 28 May 2024 15:33:59 -0700 Subject: [PATCH 77/89] [Driver] Remove unneeded *-linux-gnu after D158183 Recommit 435ea21c897f94b5a3777a9f152e4c5bb4a371a3. As the comment added by a07727199db0525e9d2df41e466a2a1611b3c8e1 suggests, these `*Triples` lists should shrink over time. https://reviews.llvm.org/D158183 allows *-unknown-linux-gnu to detect *-linux-gnu. If we additionally allow x86_64-unknown-linux-gnu -m32/-mx32 to detect x86_64-linux-gnu, we can mostly remove these *-linux-gnu elements. Retain x86_64-linux-gnu for now to work around #93609. (In addition, Debian /usr/bin/clang --version uses x86_64-pc-linux-gnu). Retain i586-linux-gnu for now to work around #93502. --- clang/lib/Driver/ToolChains/Gnu.cpp | 69 ++++++++++++++--------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 9849c59685cca7..b141e5f2adfab1 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -2227,10 +2227,19 @@ void Generic_GCC::GCCInstallationDetector::init( SmallVector CandidateBiarchTripleAliases; // Add some triples that we want to check first. 
CandidateTripleAliases.push_back(TargetTriple.str()); - std::string TripleNoVendor = TargetTriple.getArchName().str() + "-" + - TargetTriple.getOSAndEnvironmentName().str(); - if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor) + std::string TripleNoVendor, BiarchTripleNoVendor; + if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor) { + StringRef OSEnv = TargetTriple.getOSAndEnvironmentName(); + if (TargetTriple.getEnvironment() == llvm::Triple::GNUX32) + OSEnv = "linux-gnu"; + TripleNoVendor = (TargetTriple.getArchName().str() + '-' + OSEnv).str(); CandidateTripleAliases.push_back(TripleNoVendor); + if (BiarchVariantTriple.getArch() != llvm::Triple::UnknownArch) { + BiarchTripleNoVendor = + (BiarchVariantTriple.getArchName().str() + '-' + OSEnv).str(); + CandidateBiarchTripleAliases.push_back(BiarchTripleNoVendor); + } + } CollectLibDirsAndTriples(TargetTriple, BiarchVariantTriple, CandidateLibDirs, CandidateTripleAliases, CandidateBiarchLibDirs, @@ -2453,11 +2462,9 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( // lists should shrink over time. Please don't add more elements to *Triples. 
static const char *const AArch64LibDirs[] = {"/lib64", "/lib"}; static const char *const AArch64Triples[] = { - "aarch64-none-linux-gnu", "aarch64-linux-gnu", "aarch64-redhat-linux", - "aarch64-suse-linux"}; + "aarch64-none-linux-gnu", "aarch64-redhat-linux", "aarch64-suse-linux"}; static const char *const AArch64beLibDirs[] = {"/lib"}; - static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu", - "aarch64_be-linux-gnu"}; + static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu"}; static const char *const ARMLibDirs[] = {"/lib"}; static const char *const ARMTriples[] = {"arm-linux-gnueabi"}; @@ -2482,9 +2489,8 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( "x86_64-linux-gnu", "x86_64-unknown-linux-gnu", "x86_64-pc-linux-gnu", "x86_64-redhat-linux6E", "x86_64-redhat-linux", "x86_64-suse-linux", - "x86_64-manbo-linux-gnu", "x86_64-linux-gnu", - "x86_64-slackware-linux", "x86_64-unknown-linux", - "x86_64-amazon-linux"}; + "x86_64-manbo-linux-gnu", "x86_64-slackware-linux", + "x86_64-unknown-linux", "x86_64-amazon-linux"}; static const char *const X32Triples[] = {"x86_64-linux-gnux32", "x86_64-pc-linux-gnux32"}; static const char *const X32LibDirs[] = {"/libx32", "/lib"}; @@ -2500,26 +2506,24 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( "loongarch64-linux-gnu", "loongarch64-unknown-linux-gnu"}; static const char *const M68kLibDirs[] = {"/lib"}; - static const char *const M68kTriples[] = { - "m68k-linux-gnu", "m68k-unknown-linux-gnu", "m68k-suse-linux"}; + static const char *const M68kTriples[] = {"m68k-unknown-linux-gnu", + "m68k-suse-linux"}; static const char *const MIPSLibDirs[] = {"/libo32", "/lib"}; static const char *const MIPSTriples[] = { "mips-linux-gnu", "mips-mti-linux", "mips-mti-linux-gnu", "mips-img-linux-gnu", "mipsisa32r6-linux-gnu"}; static const char *const MIPSELLibDirs[] = {"/libo32", "/lib"}; - static const char *const MIPSELTriples[] = { - "mipsel-linux-gnu", 
"mips-img-linux-gnu", "mipsisa32r6el-linux-gnu"}; + static const char *const MIPSELTriples[] = {"mipsel-linux-gnu", + "mips-img-linux-gnu"}; static const char *const MIPS64LibDirs[] = {"/lib64", "/lib"}; static const char *const MIPS64Triples[] = { - "mips64-linux-gnu", "mips-mti-linux-gnu", - "mips-img-linux-gnu", "mips64-linux-gnuabi64", + "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64-linux-gnuabi64", "mipsisa64r6-linux-gnu", "mipsisa64r6-linux-gnuabi64"}; static const char *const MIPS64ELLibDirs[] = {"/lib64", "/lib"}; static const char *const MIPS64ELTriples[] = { - "mips64el-linux-gnu", "mips-mti-linux-gnu", - "mips-img-linux-gnu", "mips64el-linux-gnuabi64", + "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64el-linux-gnuabi64", "mipsisa64r6el-linux-gnu", "mipsisa64r6el-linux-gnuabi64"}; static const char *const MIPSN32LibDirs[] = {"/lib32"}; @@ -2534,46 +2538,39 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( static const char *const PPCLibDirs[] = {"/lib32", "/lib"}; static const char *const PPCTriples[] = { - "powerpc-linux-gnu", "powerpc-unknown-linux-gnu", "powerpc-linux-gnuspe", + "powerpc-unknown-linux-gnu", // On 32-bit PowerPC systems running SUSE Linux, gcc is configured as a // 64-bit compiler which defaults to "-m32", hence "powerpc64-suse-linux". 
"powerpc64-suse-linux", "powerpc-montavista-linuxspe"}; static const char *const PPCLELibDirs[] = {"/lib32", "/lib"}; - static const char *const PPCLETriples[] = {"powerpcle-linux-gnu", - "powerpcle-unknown-linux-gnu", + static const char *const PPCLETriples[] = {"powerpcle-unknown-linux-gnu", "powerpcle-linux-musl"}; static const char *const PPC64LibDirs[] = {"/lib64", "/lib"}; - static const char *const PPC64Triples[] = { - "powerpc64-linux-gnu", "powerpc64-unknown-linux-gnu", - "powerpc64-suse-linux", "ppc64-redhat-linux"}; + static const char *const PPC64Triples[] = {"powerpc64-unknown-linux-gnu", + "powerpc64-suse-linux", + "ppc64-redhat-linux"}; static const char *const PPC64LELibDirs[] = {"/lib64", "/lib"}; static const char *const PPC64LETriples[] = { - "powerpc64le-linux-gnu", "powerpc64le-unknown-linux-gnu", - "powerpc64le-none-linux-gnu", "powerpc64le-suse-linux", - "ppc64le-redhat-linux"}; + "powerpc64le-unknown-linux-gnu", "powerpc64le-none-linux-gnu", + "powerpc64le-suse-linux", "ppc64le-redhat-linux"}; static const char *const RISCV32LibDirs[] = {"/lib32", "/lib"}; static const char *const RISCV32Triples[] = {"riscv32-unknown-linux-gnu", - "riscv32-linux-gnu", "riscv32-unknown-elf"}; static const char *const RISCV64LibDirs[] = {"/lib64", "/lib"}; static const char *const RISCV64Triples[] = {"riscv64-unknown-linux-gnu", - "riscv64-linux-gnu", "riscv64-unknown-elf"}; static const char *const SPARCv8LibDirs[] = {"/lib32", "/lib"}; - static const char *const SPARCv8Triples[] = {"sparc-linux-gnu", - "sparcv8-linux-gnu"}; + static const char *const SPARCv8Triples[] = {"sparcv8-linux-gnu"}; static const char *const SPARCv9LibDirs[] = {"/lib64", "/lib"}; - static const char *const SPARCv9Triples[] = {"sparc64-linux-gnu", - "sparcv9-linux-gnu"}; + static const char *const SPARCv9Triples[] = {"sparcv9-linux-gnu"}; static const char *const SystemZLibDirs[] = {"/lib64", "/lib"}; static const char *const SystemZTriples[] = { - "s390x-linux-gnu", 
"s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu", - "s390x-suse-linux", "s390x-redhat-linux"}; - + "s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu", "s390x-suse-linux", + "s390x-redhat-linux"}; using std::begin; using std::end; From 760c2aa55f0c5f56bed944328b23aa3f2f764346 Mon Sep 17 00:00:00 2001 From: PiJoules <6019989+PiJoules@users.noreply.github.com> Date: Tue, 28 May 2024 15:37:03 -0700 Subject: [PATCH 78/89] [lld] Support thumb PLTs (#86223) We are using PLTs for cortex-m33 which only supports thumb. More specifically, this is for a very restricted use case. There's no MMU so there's no sharing of virtual addresses between two processes, but this is fine. The MCU is used for running [chre nanoapps](https://android.googlesource.com/platform/system/chre/+/HEAD/doc/nanoapp_overview.md) for android. Each nanoapp is a shared library (but effectively acts as an executable containing a test suite) that is loaded and run on the MCU one binary at a time and there's only one process running at a time, so we ensure that the same text segment cannot be shared by two different running executables. GNU LD supports thumb PLTs but we want to migrate to a clang toolchain and use LLD, so thumb PLTs are needed. --- lld/ELF/Arch/ARM.cpp | 176 +++++++++++++++++++-------- lld/ELF/Config.h | 1 + lld/ELF/InputFiles.cpp | 12 ++ lld/test/ELF/armv8-thumb-plt-reloc.s | 126 +++++++++++++++++++ 4 files changed, 262 insertions(+), 53 deletions(-) create mode 100644 lld/test/ELF/armv8-thumb-plt-reloc.s diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 687f9499009d5e..3e0efe540e1bf1 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -231,36 +231,71 @@ static void writePltHeaderLong(uint8_t *buf) { // The default PLT header requires the .got.plt to be within 128 Mb of the // .plt in the positive direction. 
void ARM::writePltHeader(uint8_t *buf) const { - // Use a similar sequence to that in writePlt(), the difference is the calling - // conventions mean we use lr instead of ip. The PLT entry is responsible for - // saving lr on the stack, the dynamic loader is responsible for reloading - // it. - const uint32_t pltData[] = { - 0xe52de004, // L1: str lr, [sp,#-4]! - 0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4) - 0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4) - 0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4) - }; - - uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4; - if (!llvm::isUInt<27>(offset)) { - // We cannot encode the Offset, use the long form. - writePltHeaderLong(buf); - return; + if (config->armThumbPLTs) { + // The instruction sequence for thumb: + // + // 0: b500 push {lr} + // 2: f8df e008 ldr.w lr, [pc, #0x8] @ 0xe + // 6: 44fe add lr, pc + // 8: f85e ff08 ldr pc, [lr, #8]! + // e: .word .got.plt - .plt - 16 + // + // At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from + // `pc` in the add instruction and 8 bytes for the `lr` adjustment. + // + uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16; + assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset"); + write16(buf + 0, 0xb500); + // Split into two halves to support endianness correctly. + write16(buf + 2, 0xf8df); + write16(buf + 4, 0xe008); + write16(buf + 6, 0x44fe); + // Split into two halves to support endianness correctly. + write16(buf + 8, 0xf85e); + write16(buf + 10, 0xff08); + write32(buf + 12, offset); + + memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary + memcpy(buf + 20, trapInstr.data(), 4); + memcpy(buf + 24, trapInstr.data(), 4); + memcpy(buf + 28, trapInstr.data(), 4); + } else { + // Use a similar sequence to that in writePlt(), the difference is the + // calling conventions mean we use lr instead of ip. 
The PLT entry is + // responsible for saving lr on the stack, the dynamic loader is responsible + // for reloading it. + const uint32_t pltData[] = { + 0xe52de004, // L1: str lr, [sp,#-4]! + 0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4) + 0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4) + 0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4) + }; + + uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4; + if (!llvm::isUInt<27>(offset)) { + // We cannot encode the Offset, use the long form. + writePltHeaderLong(buf); + return; + } + write32(buf + 0, pltData[0]); + write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff)); + write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff)); + write32(buf + 12, pltData[3] | (offset & 0xfff)); + memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary + memcpy(buf + 20, trapInstr.data(), 4); + memcpy(buf + 24, trapInstr.data(), 4); + memcpy(buf + 28, trapInstr.data(), 4); } - write32(buf + 0, pltData[0]); - write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff)); - write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff)); - write32(buf + 12, pltData[3] | (offset & 0xfff)); - memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary - memcpy(buf + 20, trapInstr.data(), 4); - memcpy(buf + 24, trapInstr.data(), 4); - memcpy(buf + 28, trapInstr.data(), 4); } void ARM::addPltHeaderSymbols(InputSection &isec) const { - addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec); - addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec); + if (config->armThumbPLTs) { + addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec); + } else { + addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec); + } } // Long form PLT entries that do not have any restrictions on the displacement @@ -279,32 +314,65 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr, // .plt in the positive direction. 
void ARM::writePlt(uint8_t *buf, const Symbol &sym, uint64_t pltEntryAddr) const { - // The PLT entry is similar to the example given in Appendix A of ELF for - // the Arm Architecture. Instead of using the Group Relocations to find the - // optimal rotation for the 8-bit immediate used in the add instructions we - // hard code the most compact rotations for simplicity. This saves a load - // instruction over the long plt sequences. - const uint32_t pltData[] = { - 0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8 - 0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8 - 0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8 - }; - uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8; - if (!llvm::isUInt<27>(offset)) { - // We cannot encode the Offset, use the long form. - writePltLong(buf, sym.getGotPltVA(), pltEntryAddr); - return; + if (!config->armThumbPLTs) { + uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8; + + // The PLT entry is similar to the example given in Appendix A of ELF for + // the Arm Architecture. Instead of using the Group Relocations to find the + // optimal rotation for the 8-bit immediate used in the add instructions we + // hard code the most compact rotations for simplicity. This saves a load + // instruction over the long plt sequences. + const uint32_t pltData[] = { + 0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8 + 0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8 + 0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8 + }; + if (!llvm::isUInt<27>(offset)) { + // We cannot encode the Offset, use the long form. 
+ writePltLong(buf, sym.getGotPltVA(), pltEntryAddr); + return; + } + write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff)); + write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff)); + write32(buf + 8, pltData[2] | (offset & 0xfff)); + memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary + } else { + uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12; + assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset"); + + // A PLT entry will be: + // + // movw ip, # + // movt ip, # + // add ip, pc + // L1: ldr.w pc, [ip] + // b L1 + // + // where ip = r12 = 0xc + + // movw ip, # + write16(buf + 2, 0x0c00); // use `ip` + relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, offset); + + // movt ip, # + write16(buf + 6, 0x0c00); // use `ip` + relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, offset); + + write16(buf + 8, 0x44fc); // add ip, pc + write16(buf + 10, 0xf8dc); // ldr.w pc, [ip] (bottom half) + write16(buf + 12, 0xf000); // ldr.w pc, [ip] (upper half) + write16(buf + 14, 0xe7fc); // Branch to previous instruction } - write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff)); - write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff)); - write32(buf + 8, pltData[2] | (offset & 0xfff)); - memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary } void ARM::addPltSymbols(InputSection &isec, uint64_t off) const { - addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec); - addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec); + if (config->armThumbPLTs) { + addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec); + } else { + addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec); + addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec); + } } bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, @@ -325,6 +393,8 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, case R_ARM_JUMP24: // Source is ARM, all PLT entries are ARM so no interworking required. 
// Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb). + assert(!config->armThumbPLTs && + "If the source is ARM, we should not need Thumb PLTs"); if (s.isFunc() && expr == R_PC && (s.getVA() & 1)) return true; [[fallthrough]]; @@ -335,9 +405,9 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, } case R_ARM_THM_JUMP19: case R_ARM_THM_JUMP24: - // Source is Thumb, all PLT entries are ARM so interworking is required. + // Source is Thumb, when all PLT entries are ARM interworking is required. // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM). - if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0)) + if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0)) return true; [[fallthrough]]; case R_ARM_THM_CALL: { @@ -547,7 +617,6 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // STT_FUNC we choose whether to write a BL or BLX depending on the // value of bit 0 of Val. With bit 0 == 1 denoting Thumb. If the symbol is // not of type STT_FUNC then we must preserve the original instruction. - // PLT entries are always ARM state so we know we don't need to interwork. assert(rel.sym); // R_ARM_CALL is always reached via relocate(). bool bit0Thumb = val & 1; bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000; @@ -606,12 +675,13 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { // PLT entries are always ARM state so we know we need to interwork. assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate(). bool bit0Thumb = val & 1; + bool useThumb = bit0Thumb || config->armThumbPLTs; bool isBlx = (read16(loc + 2) & 0x1000) == 0; // lld 10.0 and before always used bit0Thumb when deciding to write a BLX - // even when type not STT_FUNC. PLT entries generated by LLD are always ARM. - if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb) + // even when type not STT_FUNC. 
+ if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb) stateChangeWarning(loc, rel.type, *rel.sym); - if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) { + if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) { // We are writing a BLX. Ensure BLX destination is 4-byte aligned. As // the BLX instruction may only be two byte aligned. This must be done // before overflow check. diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index f0dfe7f377de0e..883c4a2f84294c 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -217,6 +217,7 @@ struct Config { bool allowMultipleDefinition; bool fatLTOObjects; bool androidPackDynRelocs = false; + bool armThumbPLTs = false; bool armHasBlx = false; bool armHasMovtMovw = false; bool armJ1J2BranchEncoding = false; diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 1f496026d3ae20..d760dddcf5ec5c 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -194,6 +194,18 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) { if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base && profile == ARMBuildAttrs::MicroControllerProfile) config->armCMSESupport = true; + + // The thumb PLT entries require Thumb2 which can be used on multiple archs. + // For now, let's limit it to ones where ARM isn't available and we know have + // Thumb2. 
+ std::optional armISA = + attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use); + std::optional thumb = + attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use); + bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed; + bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32; + if (noArmISA && hasThumb2) + config->armThumbPLTs = true; } InputFile::InputFile(Kind k, MemoryBufferRef m) diff --git a/lld/test/ELF/armv8-thumb-plt-reloc.s b/lld/test/ELF/armv8-thumb-plt-reloc.s new file mode 100644 index 00000000000000..47cd5c1b741ee0 --- /dev/null +++ b/lld/test/ELF/armv8-thumb-plt-reloc.s @@ -0,0 +1,126 @@ +// REQUIRES: arm +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1 +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %s -o %t2 +// RUN: ld.lld %t1 %t2 -o %t +// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s +// RUN: ld.lld -shared %t1 %t2 -o %t.so +// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s +// RUN: llvm-readelf -S -r %t.so | FileCheck -check-prefix=DSOREL %s + +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.be +// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %s -o %t2.be +// RUN: ld.lld %t1.be %t2.be -o %t.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s +// RUN: ld.lld -shared %t1.be %t2.be -o %t.so.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s +// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s + +// RUN: ld.lld --be8 %t1.be %t2.be -o %t.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s +// RUN: ld.lld --be8 -shared %t1.be %t2.be -o %t.so.be +// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s +// RUN: llvm-readelf -S -r %t.so.be 
| FileCheck -check-prefix=DSOREL %s + +/// Test PLT entry generation + .text + .align 2 + .globl _start + .type _start,%function +_start: + bl func1 + bl func2 + bl func3 + b.w func1 + b.w func2 + b.w func3 + beq.w func1 + beq.w func2 + beq.w func3 + +/// Executable, expect no PLT +// CHECK: Disassembly of section .text: +// CHECK-EMPTY: +// CHECK-NEXT: : +// CHECK-NEXT: bx lr +// CHECK: : +// CHECK-NEXT: bx lr +// CHECK: : +// CHECK-NEXT: bx lr +// CHECK-NEXT: d4d4 +// CHECK: <_start>: +// CHECK-NEXT: bl {{.*}} +// CHECK-NEXT: bl {{.*}} +// CHECK-NEXT: bl {{.*}} +// CHECK-NEXT: b.w {{.*}} +// CHECK-NEXT: b.w {{.*}} +// CHECK-NEXT: b.w {{.*}} +// CHECK-NEXT: beq.w {{.*}} +// CHECK-NEXT: beq.w {{.*}} +// CHECK-NEXT: beq.w {{.*}} + +// DSO: Disassembly of section .text: +// DSO-EMPTY: +// DSO-NEXT: : +// DSO-NEXT: bx lr +// DSO: : +// DSO-NEXT: bx lr +// DSO: : +// DSO-NEXT: bx lr +// DSO-NEXT: d4d4 +// DSO: <_start>: +/// 0x10260 = PLT func1 +// DSO-NEXT: bl 0x10260 +/// 0x10270 = PLT func2 +// DSO-NEXT: bl 0x10270 +/// 0x10280 = PLT func3 +// DSO-NEXT: bl 0x10280 +/// 0x10260 = PLT func1 +// DSO-NEXT: b.w 0x10260 +/// 0x10270 = PLT func2 +// DSO-NEXT: b.w 0x10270 +/// 0x10280 = PLT func3 +// DSO-NEXT: b.w 0x10280 +/// 0x10260 = PLT func1 +// DSO-NEXT: beq.w 0x10260 +/// 0x10270 = PLT func2 +// DSO-NEXT: beq.w 0x10270 +/// 0x10280 = PLT func3 +// DSO-NEXT: beq.w 0x10280 +// DSO: Disassembly of section .plt: +// DSO-EMPTY: +// DSO-NEXT: 10240 <.plt>: +// DSO-NEXT: push {lr} +// DSO-NEXT: ldr.w lr, [pc, #8] +// DSO-NEXT: add lr, pc +// DSO-NEXT: ldr pc, [lr, #8]! 
+/// 0x20098 = .got.plt (0x302D8) - pc (0x10238 = .plt + 8) - 8 +// DSO-NEXT: .word 0x00020098 +// DSO-NEXT: .word 0xd4d4d4d4 +// DSO-NEXT: .word 0xd4d4d4d4 +// DSO-NEXT: .word 0xd4d4d4d4 +// DSO-NEXT: .word 0xd4d4d4d4 + +/// 136 + 2 << 16 + 0x1026c = 0x302f4 = got entry 1 +// DSO-NEXT: 10260: f240 0c88 movw r12, #136 +// DSO-NEXT: f2c0 0c02 movt r12, #2 +// DSO-NEXT: 44fc add r12, pc +// DSO-NEXT: f8dc f000 ldr.w pc, [r12] +// DSO-NEXT: e7fc b 0x1026a +/// 124 + 2 << 16 + 0x1027c = 0x302f8 = got entry 2 +// DSO-NEXT: 10270: f240 0c7c movw r12, #124 +// DSO-NEXT: f2c0 0c02 movt r12, #2 +// DSO-NEXT: 44fc add r12, pc +// DSO-NEXT: f8dc f000 ldr.w pc, [r12] +// DSO-NEXT: e7fc b 0x1027a +/// 112 + 2 << 16 + 0x1028c = 0x302fc = got entry 3 +// DSO-NEXT: 10280: f240 0c70 movw r12, #112 +// DSO-NEXT: f2c0 0c02 movt r12, #2 +// DSO-NEXT: 44fc add r12, pc +// DSO-NEXT: f8dc f000 ldr.w pc, [r12] +// DSO-NEXT: e7fc b 0x1028a + +// DSOREL: .got.plt PROGBITS 000302e8 {{.*}} 000018 00 WA 0 0 4 +// DSOREL: Relocation section '.rel.plt' +// DSOREL: 000302f4 {{.*}} R_ARM_JUMP_SLOT {{.*}} func1 +// DSOREL: 000302f8 {{.*}} R_ARM_JUMP_SLOT {{.*}} func2 +// DSOREL: 000302fc {{.*}} R_ARM_JUMP_SLOT {{.*}} func3 From f7c8a0339c64810a3c1b28d9b3b20e02a2be6232 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 May 2024 15:54:44 -0700 Subject: [PATCH 79/89] [RISCV] Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) -> (bitcast (sra (v2Xi16 (bitcast X)), 15)) (#93565) Similar for i16 and i64 elements for both fixed and scalable vectors. This reduces the number of vector instructions, but increases vl/vtype toggles. This reduces some code in 525.x264_r from SPEC2017. In that usage, the vectors are fixed with a small number of elements so vsetivli can be used. This is similar to `performMulVectorCmpZeroCombine` from AArch64. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 41 +++++++ llvm/test/CodeGen/RISCV/rvv/mul-combine.ll | 117 ++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/mul-combine.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5fc613c1b2a140..e99c6208594e3b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13704,6 +13704,44 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) -> +// (bitcast (sra (v2Xi16 (bitcast X)), 15)) +// Same for other equivalent types with other equivalent constants. +static SDValue combineVectorMulToSraBitcast(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Do this for legal vectors unless they are i1 or i8 vectors. + if (!VT.isVector() || !TLI.isTypeLegal(VT) || VT.getScalarSizeInBits() < 16) + return SDValue(); + + if (N->getOperand(0).getOpcode() != ISD::AND || + N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL) + return SDValue(); + + SDValue And = N->getOperand(0); + SDValue Srl = And.getOperand(0); + + APInt V1, V2, V3; + if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) || + !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) || + !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3)) + return SDValue(); + + unsigned HalfSize = VT.getScalarSizeInBits() / 2; + if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) || + V3 != (HalfSize - 1)) + return SDValue(); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), + EVT::getIntegerVT(*DAG.getContext(), HalfSize), + VT.getVectorElementCount() * 2); + SDLoc DL(N); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, HalfVT, Srl.getOperand(0)); + SDValue Sra = DAG.getNode(ISD::SRA, DL, HalfVT, Cast, + 
DAG.getConstant(HalfSize - 1, DL, HalfVT)); + return DAG.getNode(ISD::BITCAST, DL, VT, Sra); +} static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -13748,6 +13786,9 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBinOpOfZExt(N, DAG)) return V; + if (SDValue V = combineVectorMulToSraBitcast(N, DAG)) + return V; + return SDValue(); } diff --git a/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll new file mode 100644 index 00000000000000..6a7da925b4d43d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64 + +define <2 x i16> @test_v2i16(<2 x i16> %x) { +; CHECK-RV32-LABEL: test_v2i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 7 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_v2i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 7 +; CHECK-RV64-NEXT: ret + %1 = lshr <2 x i16> %x, + %2 = and <2 x i16> %1, + %3 = mul <2 x i16> %2, + ret <2 x i16> %3 +} + +define @test_nxv2i16( %x) { +; CHECK-RV32-LABEL: test_nxv2i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-RV32-NEXT: vsrl.vi v8, v8, 7 +; CHECK-RV32-NEXT: li a0, 257 +; CHECK-RV32-NEXT: vand.vx v8, v8, a0 +; CHECK-RV32-NEXT: vsll.vi v8, v8, 8 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_nxv2i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-RV64-NEXT: vsrl.vi v8, v8, 7 +; CHECK-RV64-NEXT: li a0, 257 +; CHECK-RV64-NEXT: vand.vx v8, v8, a0 
+; CHECK-RV64-NEXT: vsll.vi v8, v8, 8 +; CHECK-RV64-NEXT: ret + %1 = lshr %x, splat (i16 7) + %2 = and %1, splat (i16 257) + %3 = mul %2, splat (i16 256) + ret %3 +} + +define <2 x i32> @test_v2i32(<2 x i32> %x) { +; CHECK-RV32-LABEL: test_v2i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_v2i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV64-NEXT: ret + %1 = lshr <2 x i32> %x, + %2 = and <2 x i32> %1, + %3 = mul <2 x i32> %2, + ret <2 x i32> %3 +} + +define @test_nxv2i32( %x) { +; CHECK-RV32-LABEL: test_nxv2i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_nxv2i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 15 +; CHECK-RV64-NEXT: ret + %1 = lshr %x, splat (i32 15) + %2 = and %1, splat (i32 65537) + %3 = mul %2, splat (i32 65535) + ret %3 +} + +define <2 x i64> @test_v2i64(<2 x i64> %x) { +; CHECK-RV32-LABEL: test_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 31 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64-NEXT: vsra.vi v8, v8, 31 +; CHECK-RV64-NEXT: ret + %1 = lshr <2 x i64> %x, + %2 = and <2 x i64> %1, + %3 = mul <2 x i64> %2, + ret <2 x i64> %3 +} + +define @test_nxv2i64( %x) { +; CHECK-RV32-LABEL: test_nxv2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-RV32-NEXT: vsra.vi v8, v8, 31 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_nxv2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-RV64-NEXT: 
vsra.vi v8, v8, 31 +; CHECK-RV64-NEXT: ret + %1 = lshr %x, splat (i64 31) + %2 = and %1, splat (i64 4294967297) + %3 = mul %2, splat (i64 4294967295) + ret %3 +} From 0694552cb7e8b2041fd5e765cf5b83fc40664087 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 28 May 2024 15:56:17 -0700 Subject: [PATCH 80/89] [libc] clean up MutexLock (#93619) --- libc/src/__support/threads/linux/CMakeLists.txt | 1 + libc/src/__support/threads/linux/CndVar.cpp | 7 ++++--- libc/src/__support/threads/mutex.h | 14 -------------- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt index 39c4ad20201ca6..f6913ef0834289 100644 --- a/libc/src/__support/threads/linux/CMakeLists.txt +++ b/libc/src/__support/threads/linux/CMakeLists.txt @@ -75,4 +75,5 @@ add_object_library( libc.src.__support.OSUtil.osutil libc.src.__support.threads.linux.futex_word_type libc.src.__support.threads.mutex + libc.src.__support.CPP.mutex ) diff --git a/libc/src/__support/threads/linux/CndVar.cpp b/libc/src/__support/threads/linux/CndVar.cpp index daf56bca1ed21b..b3a0fdbda4e9ea 100644 --- a/libc/src/__support/threads/linux/CndVar.cpp +++ b/libc/src/__support/threads/linux/CndVar.cpp @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/__support/threads/CndVar.h" +#include "src/__support/CPP/mutex.h" #include "src/__support/OSUtil/syscall.h" // syscall_impl #include "src/__support/threads/linux/futex_word.h" // FutexWordType -#include "src/__support/threads/mutex.h" // Mutex, MutexLock +#include "src/__support/threads/mutex.h" // Mutex #include // For syscall numbers. 
@@ -27,7 +28,7 @@ int CndVar::wait(Mutex *m) { CndWaiter waiter; { - MutexLock ml(&qmtx); + cpp::lock_guard ml(qmtx); CndWaiter *old_back = nullptr; if (waitq_front == nullptr) { waitq_front = waitq_back = &waiter; @@ -83,7 +84,7 @@ void CndVar::notify_one() { } void CndVar::broadcast() { - MutexLock ml(&qmtx); + cpp::lock_guard ml(qmtx); uint32_t dummy_futex_word; CndWaiter *waiter = waitq_front; waitq_front = waitq_back = nullptr; diff --git a/libc/src/__support/threads/mutex.h b/libc/src/__support/threads/mutex.h index 9dded2e3f952a1..392b38984dc0ae 100644 --- a/libc/src/__support/threads/mutex.h +++ b/libc/src/__support/threads/mutex.h @@ -43,18 +43,4 @@ #include "src/__support/threads/gpu/mutex.h" #endif // __linux__ -namespace LIBC_NAMESPACE { - -// An RAII class for easy locking and unlocking of mutexes. -class MutexLock { - Mutex *mutex; - -public: - explicit MutexLock(Mutex *m) : mutex(m) { mutex->lock(); } - - ~MutexLock() { mutex->unlock(); } -}; - -} // namespace LIBC_NAMESPACE - #endif // LLVM_LIBC_SRC___SUPPORT_THREADS_MUTEX_H From c179d50fd3d84311708701d84e3bca60570d3d7f Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 16:10:11 -0700 Subject: [PATCH 81/89] [WebAssembly] Add exnref type (#93586) This adds (back) the exnref type restored in the new EH proposal adopted in Oct 2023 CG meeting: https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md --- lld/wasm/WriterUtils.cpp | 2 ++ llvm/include/llvm/BinaryFormat/Wasm.h | 9 ++++--- llvm/include/llvm/CodeGen/ValueTypes.td | 9 ++++--- llvm/include/llvm/IR/Intrinsics.td | 2 ++ llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 18 +++++++++++++ llvm/lib/CodeGen/ValueTypes.cpp | 1 + llvm/lib/Object/WasmObjectFile.cpp | 8 ++++-- llvm/lib/ObjectYAML/WasmYAML.cpp | 2 ++ .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 12 +++++++++ .../WebAssemblyMCTypeUtilities.cpp | 6 +++++ .../MCTargetDesc/WebAssemblyMCTypeUtilities.h | 4 ++- 
.../Utils/WebAssemblyTypeUtilities.cpp | 3 +++ .../WebAssembly/WebAssemblyAsmPrinter.cpp | 2 ++ .../WebAssembly/WebAssemblyExplicitLocals.cpp | 10 +++++++ .../WebAssembly/WebAssemblyFastISel.cpp | 16 ++++++++++++ .../WebAssembly/WebAssemblyISelLowering.cpp | 3 +++ .../WebAssembly/WebAssemblyInstrInfo.td | 3 +++ .../Target/WebAssembly/WebAssemblyInstrRef.td | 8 +++--- .../WebAssembly/WebAssemblyInstrTable.td | 2 ++ .../WebAssembly/WebAssemblyRegStackify.cpp | 2 ++ .../WebAssembly/WebAssemblyRegisterInfo.td | 2 ++ .../WebAssembly/WebAssemblyUtilities.cpp | 2 ++ .../test/CodeGen/WebAssembly/reg-argument.mir | 11 ++++++++ llvm/test/CodeGen/WebAssembly/reg-copy.mir | 11 ++++++++ llvm/test/MC/WebAssembly/basic-assembly.s | 21 +++++++++------ llvm/test/MC/WebAssembly/reference-types.s | 26 +++++++++++++++++-- .../test/MC/WebAssembly/type-checker-errors.s | 16 ++++++++++++ 27 files changed, 188 insertions(+), 23 deletions(-) diff --git a/lld/wasm/WriterUtils.cpp b/lld/wasm/WriterUtils.cpp index cdd2c42f939efe..c6a1592012e64c 100644 --- a/lld/wasm/WriterUtils.cpp +++ b/lld/wasm/WriterUtils.cpp @@ -35,6 +35,8 @@ std::string toString(ValType type) { return "funcref"; case ValType::EXTERNREF: return "externref"; + case ValType::EXNREF: + return "exnref"; case ValType::OTHERREF: return "otherref"; } diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h index 38ef8e37df91d3..acf89885af6fdb 100644 --- a/llvm/include/llvm/BinaryFormat/Wasm.h +++ b/llvm/include/llvm/BinaryFormat/Wasm.h @@ -58,15 +58,16 @@ enum : unsigned { WASM_TYPE_V128 = 0x7B, WASM_TYPE_NULLFUNCREF = 0x73, WASM_TYPE_NULLEXTERNREF = 0x72, + WASM_TYPE_NULLEXNREF = 0x74, WASM_TYPE_NULLREF = 0x71, WASM_TYPE_FUNCREF = 0x70, WASM_TYPE_EXTERNREF = 0x6F, + WASM_TYPE_EXNREF = 0x69, WASM_TYPE_ANYREF = 0x6E, WASM_TYPE_EQREF = 0x6D, WASM_TYPE_I31REF = 0x6C, WASM_TYPE_STRUCTREF = 0x6B, WASM_TYPE_ARRAYREF = 0x6A, - WASM_TYPE_EXNREF = 0x69, WASM_TYPE_NONNULLABLE = 0x64, WASM_TYPE_NULLABLE = 
0x63, WASM_TYPE_FUNC = 0x60, @@ -261,8 +262,9 @@ enum class ValType { V128 = WASM_TYPE_V128, FUNCREF = WASM_TYPE_FUNCREF, EXTERNREF = WASM_TYPE_EXTERNREF, + EXNREF = WASM_TYPE_EXNREF, // Unmodeled value types include ref types with heap types other than - // func or extern, and type-specialized funcrefs + // func, extern or exn, and type-specialized funcrefs OTHERREF = 0xff, }; @@ -410,7 +412,8 @@ struct WasmDataSegment { // 1) Does not model passive or declarative segments (Segment will end up with // an Offset field of i32.const 0) // 2) Does not model init exprs (Segment will get an empty Functions list) -// 2) Does not model types other than basic funcref/externref (see ValType) +// 3) Does not model types other than basic funcref/externref/exnref (see +// ValType) struct WasmElemSegment { uint32_t Flags; uint32_t TableNumber; diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index c3e378ed8f6edb..e322cc04c1c769 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -280,11 +280,12 @@ def untyped : ValueType<8, 193> { // Produces an untyped value } def funcref : ValueType<0, 194>; // WebAssembly's funcref type def externref : ValueType<0, 195>; // WebAssembly's externref type -def x86amx : ValueType<8192, 196>; // X86 AMX value -def i64x8 : ValueType<512, 197>; // 8 Consecutive GPRs (AArch64) +def exnref : ValueType<0, 196>; // WebAssembly's exnref type +def x86amx : ValueType<8192, 197>; // X86 AMX value +def i64x8 : ValueType<512, 198>; // 8 Consecutive GPRs (AArch64) def aarch64svcount - : ValueType<16, 198>; // AArch64 predicate-as-counter -def spirvbuiltin : ValueType<0, 199>; // SPIR-V's builtin type + : ValueType<16, 199>; // AArch64 predicate-as-counter +def spirvbuiltin : ValueType<0, 200>; // SPIR-V's builtin type def token : ValueType<0, 248>; // TokenTy def MetadataVT : ValueType<0, 249> { // Metadata diff --git a/llvm/include/llvm/IR/Intrinsics.td 
b/llvm/include/llvm/IR/Intrinsics.td index 3019f68083d422..c3ac53837444ef 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -316,6 +316,7 @@ def IIT_PPCF128 : IIT_VT; def IIT_V3 : IIT_Vec<3, 53>; def IIT_EXTERNREF : IIT_VT; def IIT_FUNCREF : IIT_VT; +def IIT_EXNREF: IIT_VT; def IIT_I2 : IIT_Int<2, 57>; def IIT_I4 : IIT_Int<4, 58>; def IIT_AARCH64_SVCOUNT : IIT_VT; @@ -581,6 +582,7 @@ def llvm_vararg_ty : LLVMType; // this means vararg here def llvm_externref_ty : LLVMType; def llvm_funcref_ty : LLVMType; +def llvm_exnref_ty : LLVMType; //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index 237f268784bb02..47aab196a6d4f9 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -31,12 +31,17 @@ def int_wasm_ref_null_extern : DefaultAttrsIntrinsic<[llvm_externref_ty], [], [IntrNoMem]>; def int_wasm_ref_null_func : DefaultAttrsIntrinsic<[llvm_funcref_ty], [], [IntrNoMem]>; +def int_wasm_ref_null_exn: + DefaultAttrsIntrinsic<[llvm_exnref_ty], [], [IntrNoMem]>; def int_wasm_ref_is_null_extern : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_externref_ty], [IntrNoMem], "llvm.wasm.ref.is_null.extern">; def int_wasm_ref_is_null_func : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_funcref_ty], [IntrNoMem], "llvm.wasm.ref.is_null.func">; +def int_wasm_ref_is_null_exn : + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_exnref_ty], [IntrNoMem], + "llvm.wasm.ref.is_null.exn">; //===----------------------------------------------------------------------===// // Table intrinsics @@ -47,6 +52,9 @@ def int_wasm_table_set_externref : def int_wasm_table_set_funcref : DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty], [IntrWriteMem]>; +def int_wasm_table_set_exnref : + DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty], + 
[IntrWriteMem]>; def int_wasm_table_get_externref : DefaultAttrsIntrinsic<[llvm_externref_ty], [llvm_table_ty, llvm_i32_ty], @@ -54,6 +62,9 @@ def int_wasm_table_get_externref : def int_wasm_table_get_funcref : DefaultAttrsIntrinsic<[llvm_funcref_ty], [llvm_table_ty, llvm_i32_ty], [IntrReadMem]>; +def int_wasm_table_get_exnref : + DefaultAttrsIntrinsic<[llvm_exnref_ty], [llvm_table_ty, llvm_i32_ty], + [IntrReadMem]>; // Query the current table size, and increase the current table size. def int_wasm_table_size : @@ -68,6 +79,9 @@ def int_wasm_table_grow_externref : def int_wasm_table_grow_funcref : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_table_ty, llvm_funcref_ty, llvm_i32_ty], []>; +def int_wasm_table_grow_exnref : + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_table_ty, llvm_exnref_ty, llvm_i32_ty], []>; def int_wasm_table_fill_externref : DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_externref_ty, @@ -76,6 +90,10 @@ def int_wasm_table_fill_funcref : DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty, llvm_i32_ty], []>; +def int_wasm_table_fill_exnref : + DefaultAttrsIntrinsic<[], + [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty, + llvm_i32_ty], []>; //===----------------------------------------------------------------------===// // Trapping float-to-int conversions diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 3d5c58d282da56..df1c02c3dc67c2 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -181,6 +181,7 @@ std::string EVT::getEVTString() const { case MVT::Metadata: return "Metadata"; case MVT::Untyped: return "Untyped"; case MVT::funcref: return "funcref"; + case MVT::exnref: return "exnref"; case MVT::externref: return "externref"; case MVT::aarch64svcount: return "aarch64svcount"; diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 6507a0e5950ebe..23381955c60a88 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ 
b/llvm/lib/Object/WasmObjectFile.cpp @@ -177,8 +177,8 @@ static uint8_t readOpcode(WasmObjectFile::ReadContext &Ctx) { static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx, uint32_t Code) { - // only directly encoded FUNCREF/EXTERNREF are supported - // (not ref null func or ref null extern) + // only directly encoded FUNCREF/EXTERNREF/EXNREF are supported + // (not ref null func, ref null extern, or ref null exn) switch (Code) { case wasm::WASM_TYPE_I32: case wasm::WASM_TYPE_I64: @@ -187,6 +187,7 @@ static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx, case wasm::WASM_TYPE_V128: case wasm::WASM_TYPE_FUNCREF: case wasm::WASM_TYPE_EXTERNREF: + case wasm::WASM_TYPE_EXNREF: return wasm::ValType(Code); } if (Code == wasm::WASM_TYPE_NULLABLE || Code == wasm::WASM_TYPE_NONNULLABLE) { @@ -1288,6 +1289,7 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) { auto ElemType = Im.Table.ElemType; if (ElemType != wasm::ValType::FUNCREF && ElemType != wasm::ValType::EXTERNREF && + ElemType != wasm::ValType::EXNREF && ElemType != wasm::ValType::OTHERREF) return make_error("invalid table element type", object_error::parse_failed); @@ -1346,6 +1348,7 @@ Error WasmObjectFile::parseTableSection(ReadContext &Ctx) { auto ElemType = Tables.back().Type.ElemType; if (ElemType != wasm::ValType::FUNCREF && ElemType != wasm::ValType::EXTERNREF && + ElemType != wasm::ValType::EXNREF && ElemType != wasm::ValType::OTHERREF) { return make_error("invalid table element type", object_error::parse_failed); @@ -1680,6 +1683,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { Segment.ElemKind = parseValType(Ctx, ElemKind); if (Segment.ElemKind != wasm::ValType::FUNCREF && Segment.ElemKind != wasm::ValType::EXTERNREF && + Segment.ElemKind != wasm::ValType::EXNREF && Segment.ElemKind != wasm::ValType::OTHERREF) { return make_error("invalid elem type", object_error::parse_failed); diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp 
b/llvm/lib/ObjectYAML/WasmYAML.cpp index 544a91d03dce01..7ad338f65706d5 100644 --- a/llvm/lib/ObjectYAML/WasmYAML.cpp +++ b/llvm/lib/ObjectYAML/WasmYAML.cpp @@ -606,6 +606,7 @@ void ScalarEnumerationTraits::enumeration( ECase(V128); ECase(FUNCREF); ECase(EXTERNREF); + ECase(EXNREF); ECase(OTHERREF); #undef ECase } @@ -640,6 +641,7 @@ void ScalarEnumerationTraits::enumeration( #define ECase(X) IO.enumCase(Type, #X, CONCAT(X)); ECase(FUNCREF); ECase(EXTERNREF); + ECase(EXNREF); ECase(OTHERREF); #undef ECase } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 34502170a5c71f..b7498cb4299452 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -355,6 +355,8 @@ inline bool isArgument(unsigned Opc) { case WebAssembly::ARGUMENT_funcref_S: case WebAssembly::ARGUMENT_externref: case WebAssembly::ARGUMENT_externref_S: + case WebAssembly::ARGUMENT_exnref: + case WebAssembly::ARGUMENT_exnref_S: return true; default: return false; @@ -377,6 +379,8 @@ inline bool isCopy(unsigned Opc) { case WebAssembly::COPY_FUNCREF_S: case WebAssembly::COPY_EXTERNREF: case WebAssembly::COPY_EXTERNREF_S: + case WebAssembly::COPY_EXNREF: + case WebAssembly::COPY_EXNREF_S: return true; default: return false; @@ -399,6 +403,8 @@ inline bool isTee(unsigned Opc) { case WebAssembly::TEE_FUNCREF_S: case WebAssembly::TEE_EXTERNREF: case WebAssembly::TEE_EXTERNREF_S: + case WebAssembly::TEE_EXNREF: + case WebAssembly::TEE_EXNREF_S: return true; default: return false; @@ -489,6 +495,8 @@ inline bool isLocalGet(unsigned Opc) { case WebAssembly::LOCAL_GET_FUNCREF_S: case WebAssembly::LOCAL_GET_EXTERNREF: case WebAssembly::LOCAL_GET_EXTERNREF_S: + case WebAssembly::LOCAL_GET_EXNREF: + case WebAssembly::LOCAL_GET_EXNREF_S: return true; default: return false; @@ -511,6 +519,8 @@ inline bool 
isLocalSet(unsigned Opc) { case WebAssembly::LOCAL_SET_FUNCREF_S: case WebAssembly::LOCAL_SET_EXTERNREF: case WebAssembly::LOCAL_SET_EXTERNREF_S: + case WebAssembly::LOCAL_SET_EXNREF: + case WebAssembly::LOCAL_SET_EXNREF_S: return true; default: return false; @@ -533,6 +543,8 @@ inline bool isLocalTee(unsigned Opc) { case WebAssembly::LOCAL_TEE_FUNCREF_S: case WebAssembly::LOCAL_TEE_EXTERNREF: case WebAssembly::LOCAL_TEE_EXTERNREF_S: + case WebAssembly::LOCAL_TEE_EXNREF: + case WebAssembly::LOCAL_TEE_EXNREF_S: return true; default: return false; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp index 8ea02bd2ad1ff0..d9c8e22bbbaf5b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp @@ -27,6 +27,7 @@ std::optional WebAssembly::parseType(StringRef Type) { wasm::ValType::V128) .Case("funcref", wasm::ValType::FUNCREF) .Case("externref", wasm::ValType::EXTERNREF) + .Case("exnref", wasm::ValType::EXNREF) .Default(std::nullopt); } @@ -40,6 +41,7 @@ WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) { .Case("v128", WebAssembly::BlockType::V128) .Case("funcref", WebAssembly::BlockType::Funcref) .Case("externref", WebAssembly::BlockType::Externref) + .Case("exnref", WebAssembly::BlockType::Exnref) .Case("void", WebAssembly::BlockType::Void) .Default(WebAssembly::BlockType::Invalid); } @@ -62,6 +64,8 @@ const char *WebAssembly::anyTypeToString(unsigned Type) { return "funcref"; case wasm::WASM_TYPE_EXTERNREF: return "externref"; + case wasm::WASM_TYPE_EXNREF: + return "exnref"; case wasm::WASM_TYPE_FUNC: return "func"; case wasm::WASM_TYPE_NORESULT: @@ -110,6 +114,8 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) { return wasm::ValType::FUNCREF; case WebAssembly::EXTERNREFRegClassID: return wasm::ValType::EXTERNREF; + case 
WebAssembly::EXNREFRegClassID: + return wasm::ValType::EXNREF; default: llvm_unreachable("unexpected type"); } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h index 486cf264d13e2f..063ee4dba9068e 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h @@ -32,6 +32,7 @@ enum class BlockType : unsigned { V128 = unsigned(wasm::ValType::V128), Externref = unsigned(wasm::ValType::EXTERNREF), Funcref = unsigned(wasm::ValType::FUNCREF), + Exnref = unsigned(wasm::ValType::EXNREF), // Multivalue blocks (and other non-void blocks) are only emitted when the // blocks will never be exited and are at the ends of functions (see // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made @@ -41,7 +42,8 @@ enum class BlockType : unsigned { }; inline bool isRefType(wasm::ValType Type) { - return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF; + return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF || + Type == wasm::ValType::EXNREF; } // Convert ValType or a list/signature of ValTypes to a string. 
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp index 867953b4e8d71d..f9293460e701a0 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp @@ -33,6 +33,7 @@ MVT WebAssembly::parseMVT(StringRef Type) { .Case("v2i64", MVT::v2i64) .Case("funcref", MVT::funcref) .Case("externref", MVT::externref) + .Case("exnref", MVT::exnref) .Default(MVT::INVALID_SIMPLE_VALUE_TYPE); } @@ -58,6 +59,8 @@ wasm::ValType WebAssembly::toValType(MVT Type) { return wasm::ValType::FUNCREF; case MVT::externref: return wasm::ValType::EXTERNREF; + case MVT::exnref: + return wasm::ValType::EXNREF; default: llvm_unreachable("unexpected type"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 443558537da245..0b7ec6e74cab20 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -125,6 +125,8 @@ static char getInvokeSig(wasm::ValType VT) { return 'F'; case wasm::ValType::EXTERNREF: return 'X'; + case wasm::ValType::EXNREF: + return 'E'; default: llvm_unreachable("Unhandled wasm::ValType enum"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp index 0159c44a79b76d..3c6a29311a10e4 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp @@ -100,6 +100,8 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) { return WebAssembly::DROP_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::DROP_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::DROP_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -119,6 +121,8 @@ static unsigned 
getLocalGetOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_GET_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::LOCAL_GET_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_GET_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -138,6 +142,8 @@ static unsigned getLocalSetOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_SET_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::LOCAL_SET_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_SET_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -157,6 +163,8 @@ static unsigned getLocalTeeOpcode(const TargetRegisterClass *RC) { return WebAssembly::LOCAL_TEE_FUNCREF; if (RC == &WebAssembly::EXTERNREFRegClass) return WebAssembly::LOCAL_TEE_EXTERNREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::LOCAL_TEE_EXNREF; llvm_unreachable("Unexpected register class"); } @@ -176,6 +184,8 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) { return MVT::funcref; if (RC == &WebAssembly::EXTERNREFRegClass) return MVT::externref; + if (RC == &WebAssembly::EXNREFRegClass) + return MVT::exnref; llvm_unreachable("unrecognized register class"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 26e13948bc9a68..aa3aa1b007a530 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -137,6 +137,10 @@ class WebAssemblyFastISel final : public FastISel { if (Subtarget->hasReferenceTypes()) return VT; break; + case MVT::exnref: + if (Subtarget->hasReferenceTypes() && Subtarget->hasExceptionHandling()) + return VT; + break; case MVT::f16: return MVT::f32; case MVT::v16i8: @@ -717,6 +721,10 @@ bool WebAssemblyFastISel::fastLowerArguments() { Opc = WebAssembly::ARGUMENT_externref; RC = &WebAssembly::EXTERNREFRegClass; break; + 
case MVT::exnref: + Opc = WebAssembly::ARGUMENT_exnref; + RC = &WebAssembly::EXNREFRegClass; + break; default: return false; } @@ -821,6 +829,9 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { case MVT::externref: ResultReg = createResultReg(&WebAssembly::EXTERNREFRegClass); break; + case MVT::exnref: + ResultReg = createResultReg(&WebAssembly::EXNREFRegClass); + break; default: return false; } @@ -948,6 +959,10 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { Opc = WebAssembly::SELECT_EXTERNREF; RC = &WebAssembly::EXTERNREFRegClass; break; + case MVT::exnref: + Opc = WebAssembly::SELECT_EXNREF; + RC = &WebAssembly::EXNREFRegClass; + break; default: return false; } @@ -1355,6 +1370,7 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { case MVT::v2f64: case MVT::funcref: case MVT::externref: + case MVT::exnref: break; default: return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 518b6932a0c879..f9f16498bb390c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -76,6 +76,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( if (Subtarget->hasReferenceTypes()) { addRegisterClass(MVT::externref, &WebAssembly::EXTERNREFRegClass); addRegisterClass(MVT::funcref, &WebAssembly::FUNCREFRegClass); + if (Subtarget->hasExceptionHandling()) { + addRegisterClass(MVT::exnref, &WebAssembly::EXNREFRegClass); + } } // Compute derived properties from the register classes. 
computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index c1a5a45395e87d..3d37eb2fa27bce 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -292,6 +292,7 @@ defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; +defm "": ARGUMENT; // local.get and local.set are not generated by instruction selection; they // are implied by virtual register uses and defs. @@ -375,6 +376,8 @@ defm "" : LOCAL; defm "" : LOCAL, Requires<[HasSIMD128]>; defm "" : LOCAL, Requires<[HasReferenceTypes]>; defm "" : LOCAL, Requires<[HasReferenceTypes]>; +defm "" : LOCAL, + Requires<[HasReferenceTypes, HasExceptionHandling]>; let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in { defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm), diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index 608963d588635e..2654a09387fd4a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -17,8 +17,9 @@ multiclass REF_I { [(set rc:$dst, (!cast("int_wasm_ref_null_" # ht)))], "ref.null_" # ht # "$dst", "ref.null_" # ht, - !cond(!eq(ht, "func") : 0xd070, - !eq(ht, "extern") : 0xd06f)>, + !cond(!eq(ht, "func") : 0xd070, + !eq(ht, "extern") : 0xd06f, + !eq(ht, "exn") : 0xd069)>, Requires<[HasReferenceTypes]>; defm SELECT_#rc: I<(outs rc:$dst), (ins rc:$lhs, rc:$rhs, I32:$cond), (outs), (ins), @@ -37,8 +38,9 @@ multiclass REF_I { defm "" : REF_I; defm "" : REF_I; +defm "" : REF_I; -foreach rc = [FUNCREF, EXTERNREF] in { +foreach rc = [FUNCREF, EXTERNREF, EXNREF] in { def : Pat<(select (i32 (setne I32:$cond, 0)), rc:$lhs, rc:$rhs), (!cast("SELECT_"#rc) rc:$lhs, rc:$rhs, I32:$cond)>; def : Pat<(select (i32 (seteq I32:$cond, 0)), rc:$lhs, rc:$rhs), 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td index 069ce5e3bc94a9..02f0ab8577c3d0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td @@ -64,6 +64,8 @@ multiclass TABLE { defm "" : TABLE, Requires<[HasReferenceTypes]>; defm "" : TABLE, Requires<[HasReferenceTypes]>; +defm "" : TABLE, + Requires<[HasReferenceTypes, HasExceptionHandling]>; def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r), (TABLE_SET_FUNCREF mcsym:$table, i32:$idx, funcref:$r)>, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index ef174e1716ef1e..d4edb6bf18d932 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -504,6 +504,8 @@ static unsigned getTeeOpcode(const TargetRegisterClass *RC) { return WebAssembly::TEE_EXTERNREF; if (RC == &WebAssembly::FUNCREFRegClass) return WebAssembly::TEE_FUNCREF; + if (RC == &WebAssembly::EXNREFRegClass) + return WebAssembly::TEE_EXNREF; llvm_unreachable("Unexpected register class"); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 4e2faa608be077..17889dacc868c2 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -45,6 +45,7 @@ def V128_0: WebAssemblyReg<"%v128">; def FUNCREF_0 : WebAssemblyReg<"%funcref.0">; def EXTERNREF_0 : WebAssemblyReg<"%externref.0">; +def EXNREF_0 : WebAssemblyReg<"%exnref.0">; // The value stack "register". This is an opaque entity which serves to order // uses and defs that must remain in LIFO order. 
@@ -68,3 +69,4 @@ def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8, 128, (add V128_0)>; def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>; def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>; +def EXNREF : WebAssemblyRegClass<[exnref], 0, (add EXNREF_0)>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 60e872549f87d9..5e7279808cce63 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -175,6 +175,8 @@ unsigned WebAssembly::getCopyOpcodeForRegClass(const TargetRegisterClass *RC) { return WebAssembly::COPY_FUNCREF; case WebAssembly::EXTERNREFRegClassID: return WebAssembly::COPY_EXTERNREF; + case WebAssembly::EXNREFRegClassID: + return WebAssembly::COPY_EXNREF; default: llvm_unreachable("Unexpected register class"); } diff --git a/llvm/test/CodeGen/WebAssembly/reg-argument.mir b/llvm/test/CodeGen/WebAssembly/reg-argument.mir index 23e66dfc71fa1b..a549990bdb0a2b 100644 --- a/llvm/test/CodeGen/WebAssembly/reg-argument.mir +++ b/llvm/test/CodeGen/WebAssembly/reg-argument.mir @@ -68,3 +68,14 @@ body: | %1:externref = ARGUMENT_externref 0, implicit $arguments RETURN implicit-def $arguments ... +--- +name: argument_exnref +# CHECK-LABEL: argument_exnref +body: | + ; CHECK-LABEL: bb.0: + ; CHECK-NEXT: %1:exnref = ARGUMENT_exnref 0 + bb.0: + %0:i32 = CONST_I32 0, implicit-def $arguments + %1:exnref = ARGUMENT_exnref 0, implicit $arguments + RETURN implicit-def $arguments +... diff --git a/llvm/test/CodeGen/WebAssembly/reg-copy.mir b/llvm/test/CodeGen/WebAssembly/reg-copy.mir index 31a5bfa63a4ea2..763fe42d07b61a 100644 --- a/llvm/test/CodeGen/WebAssembly/reg-copy.mir +++ b/llvm/test/CodeGen/WebAssembly/reg-copy.mir @@ -77,3 +77,14 @@ body: | %0:externref = COPY %1:externref RETURN implicit-def $arguments ... 
+--- +name: copy_exnref +# CHECK-LABEL: copy_exnref +body: | + ; CHECK-LABEL: bb.0: + ; CHECK-NEXT: %0:exnref = COPY_EXNREF %1:exnref + ; CHECK-NEXT: RETURN + bb.0: + %0:exnref = COPY %1:exnref + RETURN implicit-def $arguments +... diff --git a/llvm/test/MC/WebAssembly/basic-assembly.s b/llvm/test/MC/WebAssembly/basic-assembly.s index 769cd7edfa8a3e..ac358c1b5c7a52 100644 --- a/llvm/test/MC/WebAssembly/basic-assembly.s +++ b/llvm/test/MC/WebAssembly/basic-assembly.s @@ -146,12 +146,14 @@ test0: .ident "clang version 9.0.0 (trunk 364502) (llvm/trunk 364571)" -.tabletype empty_eref_table, externref -empty_eref_table: +.tabletype empty_externref_table, externref +empty_externref_table: -.tabletype empty_fref_table, funcref -empty_fref_table: +.tabletype empty_funcref_table, funcref +empty_funcref_table: +.tabletype empty_exnref_table, exnref +empty_exnref_table: # CHECK: .text # CHECK: .globaltype __stack_pointer, i32 @@ -283,8 +285,11 @@ empty_fref_table: # CHECK-NEXT: .p2align 2 # CHECK-NEXT: .int32 test0 -# CHECK: .tabletype empty_eref_table, externref -# CHECK-NEXT: empty_eref_table: +# CHECK: .tabletype empty_externref_table, externref +# CHECK-NEXT: empty_externref_table: -# CHECK: .tabletype empty_fref_table, funcref -# CHECK-NEXT: empty_fref_table: +# CHECK: .tabletype empty_funcref_table, funcref +# CHECK-NEXT: empty_funcref_table: + +# CHECK: .tabletype empty_exnref_table, exnref +# CHECK-NEXT: empty_exnref_table: diff --git a/llvm/test/MC/WebAssembly/reference-types.s b/llvm/test/MC/WebAssembly/reference-types.s index ab3e3ee6b155b1..2f8bfba68dcea1 100644 --- a/llvm/test/MC/WebAssembly/reference-types.s +++ b/llvm/test/MC/WebAssembly/reference-types.s @@ -4,22 +4,27 @@ # CHECK-LABEL:ref_is_null: # CHECK: ref.is_null # encoding: [0xd1] ref_is_null: - .functype ref_is_null () -> (i32, i32) + .functype ref_is_null () -> (i32, i32, i32) ref.null_extern ref.is_null ref.null_func ref.is_null + ref.null_exn + ref.is_null end_function # CHECK-LABEL: ref_null_test: 
# CHECK: ref.null_func # encoding: [0xd0,0x70] # CHECK: ref.null_extern # encoding: [0xd0,0x6f] +# CHECK: ref.null_exn # encoding: [0xd0,0x69] ref_null_test: .functype ref_null_test () -> () ref.null_func drop ref.null_extern drop + ref.null_exn + drop end_function # CHECK-LABEL: ref_sig_test_funcref: @@ -36,9 +41,17 @@ ref_sig_test_externref: local.get 0 end_function +# CHECK-LABEL: ref_sig_test_exnref: +# CHECK-NEXT: .functype ref_sig_test_exnref (exnref) -> (exnref) +ref_sig_test_exnref: + .functype ref_sig_test_exnref (exnref) -> (exnref) + local.get 0 + end_function + # CHECK-LABEL: ref_select_test: # CHECK: funcref.select # encoding: [0x1b] # CHECK: externref.select # encoding: [0x1b] +# CHECK: exnref.select # encoding: [0x1b] ref_select_test: .functype ref_select_test () -> () ref.null_func @@ -51,15 +64,24 @@ ref_select_test: i32.const 0 externref.select drop + ref.null_exn + ref.null_exn + i32.const 0 + exnref.select + drop end_function # CHECK-LABEL: ref_block_test: # CHECK: block funcref # CHECK: block externref +# CHECK: block exnref ref_block_test: - .functype ref_block_test () -> (externref, funcref) + .functype ref_block_test () -> (exnref, externref, funcref) block funcref block externref + block exnref + ref.null_exn + end_block ref.null_extern end_block ref.null_func diff --git a/llvm/test/MC/WebAssembly/type-checker-errors.s b/llvm/test/MC/WebAssembly/type-checker-errors.s index 5e28d117501e98..d2841250137a8c 100644 --- a/llvm/test/MC/WebAssembly/type-checker-errors.s +++ b/llvm/test/MC/WebAssembly/type-checker-errors.s @@ -215,6 +215,22 @@ table_fill_type_mismatch_3: table.fill valid_table end_function +table_fill_type_mismatch_4: + .functype table_fill_type_mismatch_4 () -> () + ref.null_exn + i32.const 1 +# CHECK: [[@LINE+1]]:3: error: popped exnref, expected externref + table.fill valid_table + end_function + +table_fill_type_mismatch_5: + .functype table_fill_type_mismatch_5 () -> () + ref.null_exn + i32.const 1 +# CHECK: [[@LINE+1]]:3: 
error: popped exnref, expected externref + table.fill valid_table + end_function + table_grow_non_exist_table: .functype table_grow_non_exist_table (externref, i32) -> (i32) local.get 0 From 4486fcba756bfa4c8729673a9533578232f0bc04 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Tue, 28 May 2024 19:14:26 -0400 Subject: [PATCH 82/89] [libc] Add proxy header for float.h. (#93504) This is the continuation of https://github.com/llvm/llvm-project/pull/88674. Fixes #88433, #90496. --------- Co-authored-by: aniplcc --- libc/hdr/CMakeLists.txt | 10 ++++++ libc/hdr/float_macros.h | 22 ++++++++++++ libc/include/llvm-libc-macros/float-macros.h | 35 ++++++++----------- .../macros/properties/CMakeLists.txt | 2 +- libc/src/__support/macros/properties/types.h | 2 +- libc/src/math/generic/CMakeLists.txt | 4 +++ libc/src/math/generic/scalbn.cpp | 11 +++--- libc/src/math/generic/scalbnf.cpp | 11 +++--- libc/src/math/generic/scalbnf128.cpp | 13 +++---- libc/src/math/generic/scalbnl.cpp | 11 +++--- .../llvm-project-overlay/libc/BUILD.bazel | 7 +++- 11 files changed, 78 insertions(+), 50 deletions(-) create mode 100644 libc/hdr/float_macros.h diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt index 91b8cb71552a71..66b82c84dac499 100644 --- a/libc/hdr/CMakeLists.txt +++ b/libc/hdr/CMakeLists.txt @@ -87,4 +87,14 @@ add_proxy_header_library( libc.include.llvm-libc-macros.time_macros ) +add_proxy_header_library( + float_macros + HDRS + float_macros.h + DEPENDS + libc.include.llvm-libc-macros.float_macros + FULL_BUILD_DEPENDS + libc.include.float +) + add_subdirectory(types) diff --git a/libc/hdr/float_macros.h b/libc/hdr/float_macros.h new file mode 100644 index 00000000000000..a0ef5e29b98687 --- /dev/null +++ b/libc/hdr/float_macros.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from math.h ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_FLOAT_MACROS_H +#define LLVM_LIBC_HDR_FLOAT_MACROS_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-macros/float-macros.h" + +#else // Overlay mode + +#include + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_FLOAT_MACROS_H diff --git a/libc/include/llvm-libc-macros/float-macros.h b/libc/include/llvm-libc-macros/float-macros.h index 4fe8590c5f70c8..81c1df868bf6cd 100644 --- a/libc/include/llvm-libc-macros/float-macros.h +++ b/libc/include/llvm-libc-macros/float-macros.h @@ -9,21 +9,6 @@ #ifndef LLVM_LIBC_MACROS_FLOAT_MACROS_H #define LLVM_LIBC_MACROS_FLOAT_MACROS_H -// Suppress `#include_next is a language extension` warnings. -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wgnu-include-next" -#pragma clang diagnostic ignored "-Winclude-next-absolute-path" -#else // gcc -#pragma GCC system_header -#endif //__clang__ - -#include_next - -#ifdef __clang__ -#pragma clang diagnostic pop -#endif //__clang__ - #ifndef FLT_RADIX #define FLT_RADIX __FLT_RADIX__ #endif // FLT_RADIX @@ -32,9 +17,13 @@ #define FLT_EVAL_METHOD __FLT_EVAL_METHOD__ #endif // FLT_EVAL_METHOD -#ifndef DECIMAL_DIG -#define DECIMAL_DIG __DECIMAL_DIG__ -#endif // DECIMAL_DIG +#ifndef FLT_ROUNDS +#if __has_builtin(__builtin_flt_rounds) +#define FLT_ROUNDS __builtin_flt_rounds() +#else +#define FLT_ROUNDS 1 +#endif +#endif // FLT_ROUNDS #ifndef FLT_DECIMAL_DIG #define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__ @@ -48,6 +37,10 @@ #define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ #endif // LDBL_DECIMAL_DIG +#ifndef DECIMAL_DIG +#define DECIMAL_DIG __DECIMAL_DIG__ +#endif // DECIMAL_DIG + #ifndef FLT_DIG #define FLT_DIG __FLT_DIG__ #endif // FLT_DIG @@ -97,15 +90,15 @@ #endif // LDBL_MAX #ifndef FLT_TRUE_MIN -#define FLT_TRUE_MIN 
__FLT_TRUE_MIN__ +#define FLT_TRUE_MIN __FLT_DENORM_MIN__ #endif // FLT_TRUE_MIN #ifndef DBL_TRUE_MIN -#define DBL_TRUE_MIN __DBL_TRUE_MIN__ +#define DBL_TRUE_MIN __DBL_DENORM_MIN__ #endif // DBL_TRUE_MIN #ifndef LDBL_TRUE_MIN -#define LDBL_TRUE_MIN __LDBL_TRUE_MIN__ +#define LDBL_TRUE_MIN __LDBL_DENORM_MIN__ #endif // LDBL_TRUE_MIN #ifndef FLT_EPSILON diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt index bbc45650f3fca3..7718aeaa3de5af 100644 --- a/libc/src/__support/macros/properties/CMakeLists.txt +++ b/libc/src/__support/macros/properties/CMakeLists.txt @@ -33,6 +33,6 @@ add_header_library( .compiler .cpu_features .os - libc.include.llvm-libc-macros.float_macros + libc.hdr.float_macros libc.include.llvm-libc-types.float128 ) diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h index d43cf99e6859be..781cf1b7a2b627 100644 --- a/libc/src/__support/macros/properties/types.h +++ b/libc/src/__support/macros/properties/types.h @@ -10,7 +10,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H -#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG +#include "hdr/float_macros.h" // LDBL_MANT_DIG #include "include/llvm-libc-types/float128.h" // float128 #include "src/__support/macros/properties/architectures.h" #include "src/__support/macros/properties/compiler.h" diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index daaf505008ca11..269bc6be5d8343 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -2933,6 +2933,7 @@ add_entrypoint_object( HDRS ../scalbn.h DEPENDS + libc.hdr.float_macros libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -2945,6 +2946,7 @@ add_entrypoint_object( HDRS ../scalbnf.h DEPENDS + libc.hdr.float_macros 
libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -2957,6 +2959,7 @@ add_entrypoint_object( HDRS ../scalbnl.h DEPENDS + libc.hdr.float_macros libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -2969,6 +2972,7 @@ add_entrypoint_object( HDRS ../scalbnf128.h DEPENDS + libc.hdr.float_macros libc.src.__support.macros.properties.types libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS diff --git a/libc/src/math/generic/scalbn.cpp b/libc/src/math/generic/scalbn.cpp index 3908f5892f144f..207cce1550bc01 100644 --- a/libc/src/math/generic/scalbn.cpp +++ b/libc/src/math/generic/scalbn.cpp @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbn.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, scalbn, (double x, int n)) { -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. -#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/scalbnf.cpp b/libc/src/math/generic/scalbnf.cpp index 4a4fa86dcfd895..e478088d3ce5a5 100644 --- a/libc/src/math/generic/scalbnf.cpp +++ b/libc/src/math/generic/scalbnf.cpp @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbnf.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, scalbnf, (float x, int n)) { -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. 
-#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/scalbnf128.cpp b/libc/src/math/generic/scalbnf128.cpp index be3d29ed27e985..5fd59611d53de7 100644 --- a/libc/src/math/generic/scalbnf128.cpp +++ b/libc/src/math/generic/scalbnf128.cpp @@ -7,21 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbnf128.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float128, scalbnf128, (float128 x, int n)) { -// TODO: should be switched to use `FLT_RADIX` in hdr/float_macros.h" instead -// see: https://github.com/llvm/llvm-project/issues/90496 -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. -#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/scalbnl.cpp b/libc/src/math/generic/scalbnl.cpp index 681338ec01f078..1225a7ebaf572d 100644 --- a/libc/src/math/generic/scalbnl.cpp +++ b/libc/src/math/generic/scalbnl.cpp @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/scalbnl.h" +#include "hdr/float_macros.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/__support/common.h" +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(long double, scalbnl, (long double x, int n)) { -#if !defined(__FLT_RADIX__) -#error __FLT_RADIX__ undefined. -#elif __FLT_RADIX__ != 2 -#error __FLT_RADIX__!=2, unimplemented. 
-#else return fputil::ldexp(x, n); -#endif } } // namespace LIBC_NAMESPACE diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 446499cf15d7b4..70ec3a48a5e2e3 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -127,6 +127,11 @@ libc_support_library( hdrs = ["hdr/time_macros.h"], ) +libc_support_library( + name = "hdr_float_macros", + hdrs = ["hdr/float_macros.h"], +) + ############################ Type Proxy Header Files ########################### libc_support_library( @@ -189,7 +194,7 @@ libc_support_library( ":__support_macros_properties_compiler", ":__support_macros_properties_cpu_features", ":__support_macros_properties_os", - ":llvm_libc_macros_float_macros", + ":hdr_float_macros", ":llvm_libc_types_float128", ], ) From 39e5036c0e22cea24df73d28746bb8fe0a117f9d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 May 2024 16:25:54 -0700 Subject: [PATCH 83/89] [SCEV] Add predicated version of getSymbolicMaxBackedgeTakenCount. (#93498) This patch adds a predicated version of getSymbolicMaxBackedgeTakenCount. The intended use for this is loop access analysis for loops with uncountable exits. When analyzing dependences and computing runtime checks, we need the smallest upper bound on the number of iterations. In terms of memory safety, it shouldn't matter if any uncomputable exits leave the loop, as long as we prove that there are no dependences given the minimum of the countable exits. The same should apply also for generating runtime checks. 
PR: https://github.com/llvm/llvm-project/pull/93498 --- llvm/include/llvm/Analysis/ScalarEvolution.h | 19 +++++++- llvm/lib/Analysis/ScalarEvolution.cpp | 48 +++++++++++++++++-- ...cated-symbolic-max-backedge-taken-count.ll | 6 +++ 3 files changed, 67 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 1d016b28347d27..72f3d945424963 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -912,6 +912,13 @@ class ScalarEvolution { return getBackedgeTakenCount(L, SymbolicMaximum); } + /// Similar to getSymbolicMaxBackedgeTakenCount, except it will add a set of + /// SCEV predicates to Predicates that are required to be true in order for + /// the answer to be correct. Predicates can be checked with run-time + /// checks and can be used to perform loop versioning. + const SCEV *getPredicatedSymbolicMaxBackedgeTakenCount( + const Loop *L, SmallVector &Predicates); + /// Return true if the backedge taken count is either the value returned by /// getConstantMaxBackedgeTakenCount or zero. bool isBackedgeTakenCountMaxOrZero(const Loop *L); @@ -1549,7 +1556,9 @@ class ScalarEvolution { ScalarEvolution *SE) const; /// Get the symbolic max backedge taken count for the loop. - const SCEV *getSymbolicMax(const Loop *L, ScalarEvolution *SE); + const SCEV * + getSymbolicMax(const Loop *L, ScalarEvolution *SE, + SmallVector *Predicates = nullptr); /// Get the symbolic max backedge taken count for the particular loop exit. const SCEV *getSymbolicMax(const BasicBlock *ExitingBlock, @@ -1746,7 +1755,7 @@ class ScalarEvolution { /// Similar to getBackedgeTakenInfo, but will add predicates as required /// with the purpose of returning complete information. 
- const BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L); + BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L); /// Compute the number of times the specified loop will iterate. /// If AllowPredicates is set, we will create new SCEV predicates as @@ -2311,6 +2320,9 @@ class PredicatedScalarEvolution { /// Get the (predicated) backedge count for the analyzed loop. const SCEV *getBackedgeTakenCount(); + /// Get the (predicated) symbolic max backedge count for the analyzed loop. + const SCEV *getSymbolicMaxBackedgeTakenCount(); + /// Adds a new predicate. void addPredicate(const SCEVPredicate &Pred); @@ -2379,6 +2391,9 @@ class PredicatedScalarEvolution { /// The backedge taken count. const SCEV *BackedgeCount = nullptr; + + /// The symbolic backedge taken count. + const SCEV *SymbolicMaxBackedgeCount = nullptr; }; template <> struct DenseMapInfo { diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index bb56b41fe15d58..e46d7183a2a359 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8295,6 +8295,11 @@ const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L, llvm_unreachable("Invalid ExitCountKind!"); } +const SCEV *ScalarEvolution::getPredicatedSymbolicMaxBackedgeTakenCount( + const Loop *L, SmallVector &Preds) { + return getPredicatedBackedgeTakenInfo(L).getSymbolicMax(L, this, &Preds); +} + bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) { return getBackedgeTakenInfo(L).isConstantMaxOrZero(this); } @@ -8311,7 +8316,7 @@ static void PushLoopPHIs(const Loop *L, Worklist.push_back(&PN); } -const ScalarEvolution::BackedgeTakenInfo & +ScalarEvolution::BackedgeTakenInfo & ScalarEvolution::getPredicatedBackedgeTakenInfo(const Loop *L) { auto &BTI = getBackedgeTakenInfo(L); if (BTI.hasFullInfo()) @@ -8644,9 +8649,9 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const { return getConstantMax(); } -const 
SCEV * -ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L, - ScalarEvolution *SE) { +const SCEV *ScalarEvolution::BackedgeTakenInfo::getSymbolicMax( + const Loop *L, ScalarEvolution *SE, + SmallVector *Predicates) { if (!SymbolicMax) { // Form an expression for the maximum exit count possible for this loop. We // merge the max and exact information to approximate a version of @@ -8661,6 +8666,12 @@ ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L, "We should only have known counts for exiting blocks that " "dominate latch!"); ExitCounts.push_back(ExitCount); + if (Predicates) + for (const auto *P : ENT.Predicates) + Predicates->push_back(P); + + assert((Predicates || ENT.hasAlwaysTruePredicate()) && + "Predicate should be always true!"); } } if (ExitCounts.empty()) @@ -13609,6 +13620,24 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, P->print(OS, 4); } + Preds.clear(); + auto *PredSymbolicMax = + SE->getPredicatedSymbolicMaxBackedgeTakenCount(L, Preds); + if (SymbolicBTC != PredSymbolicMax) { + OS << "Loop "; + L->getHeader()->printAsOperand(OS, /*PrintType=*/false); + OS << ": "; + if (!isa(PredSymbolicMax)) { + OS << "Predicated symbolic max backedge-taken count is "; + PrintSCEVWithTypeHint(OS, PredSymbolicMax); + } else + OS << "Unpredictable predicated symbolic max backedge-taken count."; + OS << "\n"; + OS << " Predicates:\n"; + for (const auto *P : Preds) + P->print(OS, 4); + } + if (SE->hasLoopInvariantBackedgeTakenCount(L)) { OS << "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); @@ -14822,6 +14851,17 @@ const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() { return BackedgeCount; } +const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() { + if (!SymbolicMaxBackedgeCount) { + SmallVector Preds; + SymbolicMaxBackedgeCount = + SE.getPredicatedSymbolicMaxBackedgeTakenCount(&L, Preds); + for (const auto *P : Preds) + addPredicate(*P); + } + return 
SymbolicMaxBackedgeCount; +} + void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { if (Preds->implies(&Pred)) return; diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll index d40416359b65c6..8dc79a54eb97a5 100644 --- a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll +++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll @@ -12,6 +12,9 @@ define void @test1(i64 %x, ptr %a, ptr %b) { ; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. ; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; entry: br label %header @@ -52,6 +55,9 @@ define void @test2(i64 %x, ptr %a) { ; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. ; CHECK-NEXT: symbolic max exit count for header: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for latch: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; entry: br label %header From 722a5fce589cea76a0baf89ce731477bae8cf4b8 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 16:27:04 -0700 Subject: [PATCH 84/89] [WebAssembly] Add -wasm-enable-exnref option (#93597) This adds `-wasm-enable-exnref`, which will enable the new EH instructions using `exnref` (adopted in Oct 2023 CG meeting): https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md This option should be used with `-wasm-enable-eh`. 
--- .../WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 7 +++++++ .../WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h | 1 + llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp | 4 ++++ llvm/test/CodeGen/WebAssembly/eh-option-errors.ll | 3 +++ 4 files changed, 15 insertions(+) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index e8f58a19d25e3b..71dfe1062956e3 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -54,6 +54,13 @@ cl::opt // setjmp/longjmp handling using wasm EH instrutions cl::opt WebAssembly::WasmEnableSjLj( "wasm-enable-sjlj", cl::desc("WebAssembly setjmp/longjmp handling")); +// Whether we use the new exnref Wasm EH proposal adopted on Oct 2023. +// Should be used with -wasm-enable-eh. +// Currently set to false by default, but will later change to true and then +// later can be removed after the legacy WAsm EH instructions are removed. 
+cl::opt WebAssembly::WasmEnableExnref( + "wasm-enable-exnref", cl::desc("WebAssembly exception handling (exnref)"), + cl::init(false)); static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/, const Triple &TT, diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index b7498cb4299452..7f1a5f616ed484 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -44,6 +44,7 @@ extern cl::opt WasmEnableEmEH; // asm.js-style EH extern cl::opt WasmEnableEmSjLj; // asm.js-style SjLJ extern cl::opt WasmEnableEH; // EH using Wasm EH instructions extern cl::opt WasmEnableSjLj; // SjLj using Wasm EH instructions +extern cl::opt WasmEnableExnref; // EH using new Wasm EH (exnref) enum OperandType { /// Basic block label in a branch construct. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 68126992ddcd72..fd92a35c2638a5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -385,6 +385,7 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { using WebAssembly::WasmEnableEH; using WebAssembly::WasmEnableEmEH; using WebAssembly::WasmEnableEmSjLj; +using WebAssembly::WasmEnableExnref; using WebAssembly::WasmEnableSjLj; static void basicCheckForEHAndSjLj(TargetMachine *TM) { @@ -401,6 +402,9 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) { if (WasmEnableEmEH && WasmEnableSjLj) report_fatal_error( "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj"); + if (WasmEnableExnref && !WasmEnableEH) + report_fatal_error( + "-wasm-enable-exnref should be used with -wasm-enable-eh"); // Here we make sure TargetOptions.ExceptionModel is the same as // MCAsmInfo.ExceptionsType. 
Normally these have to be the same, because clang diff --git a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll index 74d02ddc405d3f..52a6364e122589 100644 --- a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll +++ b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll @@ -9,6 +9,9 @@ target triple = "wasm32-unknown-unknown" ; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_SJLJ ; EM_EH_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj +; RUN: not --crash llc < %s -wasm-enable-exnref 2>&1 | FileCheck %s --check-prefix=WASM_EXNREF_ONLY +; WASM_EXNREF_ONLY: LLVM ERROR: -wasm-enable-exnref should be used with -wasm-enable-eh + ; RUN: not --crash llc < %s -wasm-enable-eh -exception-model=dwarf 2>&1 | FileCheck %s --check-prefix=EH_MODEL_DWARF ; EH_MODEL_DWARF: LLVM ERROR: -exception-model should be either 'none' or 'wasm' From 60bce6eab4d734b86f49b7638856eb8899bc89e8 Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Tue, 28 May 2024 16:33:20 -0700 Subject: [PATCH 85/89] [WebAssembly] Implement all f16x8 binary instructions. (#93360) This reuses most of the code that was created for f32x4 and f64x2 binary instructions and tries to follow how they were implemented. 
add/sub/mul/div - use regular LL instructions min/max - use the minimum/maximum intrinsic, and also have builtins pmin/pmax - use the wasm.pmax/pmin intrinsics and also have builtins Specified at: https://github.com/WebAssembly/half-precision/blob/29a9b9462c9285d4ccc1a5dc39214ddfd1892658/proposals/half-precision/Overview.md --- .../clang/Basic/BuiltinsWebAssembly.def | 4 ++ clang/lib/CodeGen/CGBuiltin.cpp | 4 ++ clang/test/CodeGen/builtins-wasm.c | 24 +++++++ .../WebAssembly/WebAssemblyISelLowering.cpp | 5 ++ .../WebAssembly/WebAssemblyInstrSIMD.td | 43 +++++++++--- .../CodeGen/WebAssembly/half-precision.ll | 68 +++++++++++++++++++ llvm/test/MC/WebAssembly/simd-encodings.s | 24 +++++++ 7 files changed, 163 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index fd8c1b480d6da0..4e48ff48b60f5f 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -135,6 +135,10 @@ TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_min_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "half-precision") TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 5edf8c79709131..a3c65105033247 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -20806,6 
+20806,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, } case WebAssembly::BI__builtin_wasm_min_f32: case WebAssembly::BI__builtin_wasm_min_f64: + case WebAssembly::BI__builtin_wasm_min_f16x8: case WebAssembly::BI__builtin_wasm_min_f32x4: case WebAssembly::BI__builtin_wasm_min_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); @@ -20816,6 +20817,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, } case WebAssembly::BI__builtin_wasm_max_f32: case WebAssembly::BI__builtin_wasm_max_f64: + case WebAssembly::BI__builtin_wasm_max_f16x8: case WebAssembly::BI__builtin_wasm_max_f32x4: case WebAssembly::BI__builtin_wasm_max_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); @@ -20824,6 +20826,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_pmin_f16x8: case WebAssembly::BI__builtin_wasm_pmin_f32x4: case WebAssembly::BI__builtin_wasm_pmin_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); @@ -20832,6 +20835,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_pmax_f16x8: case WebAssembly::BI__builtin_wasm_pmax_f32x4: case WebAssembly::BI__builtin_wasm_pmax_f64x2: { Value *LHS = EmitScalarExpr(E->getArg(0)); diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 93a6ab06081c99..d6ee4f68700dca 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -825,6 +825,30 @@ float extract_lane_f16x8(f16x8 a, int i) { // WEBASSEMBLY-NEXT: ret float %0 return __builtin_wasm_extract_lane_f16x8(a, i); } + +f16x8 min_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.minimum.v8f16(<8 x half> 
%a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_min_f16x8(a, b); +} + +f16x8 max_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.maximum.v8f16(<8 x half> %a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_max_f16x8(a, b); +} + +f16x8 pmin_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_pmin_f16x8(a, b); +} + +f16x8 pmax_f16x8(f16x8 a, f16x8 b) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b) + // WEBASSEMBLY-NEXT: ret <8 x half> %0 + return __builtin_wasm_pmax_f16x8(a, b); +} __externref_t externref_null() { return __builtin_wasm_ref_null_extern(); // WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern() diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index f9f16498bb390c..4beab9d091581b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -145,6 +145,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setTruncStoreAction(T, MVT::f16, Expand); } + if (Subtarget->hasHalfPrecision()) { + setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); + } + // Expand unavailable integer operations. 
for (auto Op : {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 558e3d859dcd84..baf15ccdbe9edb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -16,33 +16,34 @@ multiclass ABSTRACT_SIMD_I pattern_r, string asmstr_r, string asmstr_s, bits<32> simdop, - Predicate simd_level> { + list reqs> { defm "" : I, - Requires<[simd_level]>; + Requires; } multiclass SIMD_I pattern_r, string asmstr_r = "", - string asmstr_s = "", bits<32> simdop = -1> { + string asmstr_s = "", bits<32> simdop = -1, + list reqs = []> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, !listconcat([HasSIMD128], reqs)>; } multiclass RELAXED_I pattern_r, string asmstr_r = "", string asmstr_s = "", bits<32> simdop = -1> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, [HasRelaxedSIMD]>; } multiclass HALF_PRECISION_I pattern_r, string asmstr_r = "", string asmstr_s = "", bits<32> simdop = -1> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, [HasHalfPrecision]>; } @@ -152,6 +153,19 @@ def F64x2 : Vec { let prefix = "f64x2"; } +def F16x8 : Vec { + let vt = v8f16; + let int_vt = v8i16; + let lane_vt = f32; + let lane_rc = F32; + let lane_bits = 16; + let lane_idx = LaneIdx8; + let lane_load = int_wasm_loadf16_f32; + let splat = PatFrag<(ops node:$x), (v8f16 (splat_vector (f16 $x)))>; + let prefix = "f16x8"; +} + +// TODO: Include F16x8 here when half precision is better supported. 
defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2]; defvar IntVecs = [I8x16, I16x8, I32x4, I64x2]; @@ -781,13 +795,19 @@ def : Pat<(v2i64 (nodes[0] (v2f64 V128:$lhs), (v2f64 V128:$rhs))), // Bitwise operations //===----------------------------------------------------------------------===// -multiclass SIMDBinary simdop> { +multiclass SIMDBinary simdop, list reqs = []> { defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins), [(set (vec.vt V128:$dst), (node (vec.vt V128:$lhs), (vec.vt V128:$rhs)))], vec.prefix#"."#name#"\t$dst, $lhs, $rhs", - vec.prefix#"."#name, simdop>; + vec.prefix#"."#name, simdop, reqs>; +} + +multiclass HalfPrecisionBinary simdop> { + defm "" : SIMDBinary; } multiclass SIMDBitwise simdop, @@ -1199,6 +1219,7 @@ def : Pat<(v2f64 (froundeven (v2f64 V128:$src))), (NEAREST_F64x2 V128:$src)>; multiclass SIMDBinaryFP baseInst> { defm "" : SIMDBinary; defm "" : SIMDBinary; + defm "" : HalfPrecisionBinary; } // Addition: add @@ -1242,7 +1263,7 @@ defm PMAX : SIMDBinaryFP; // Also match the pmin/pmax cases where the operands are int vectors (but the // comparison is still a floating point comparison). This can happen when using // the wasm_simd128.h intrinsics because v128_t is an integer vector. 
-foreach vec = [F32x4, F64x2] in { +foreach vec = [F32x4, F64x2, F16x8] in { defvar pmin = !cast("PMIN_"#vec); defvar pmax = !cast("PMAX_"#vec); def : Pat<(vec.int_vt (vselect @@ -1266,6 +1287,10 @@ def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))), (PMIN_F64x2 V128:$lhs, V128:$rhs)>; def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))), (PMAX_F64x2 V128:$lhs, V128:$rhs)>; +def : Pat<(v8f16 (int_wasm_pmin (v8f16 V128:$lhs), (v8f16 V128:$rhs))), + (PMIN_F16x8 V128:$lhs, V128:$rhs)>; +def : Pat<(v8f16 (int_wasm_pmax (v8f16 V128:$lhs), (v8f16 V128:$rhs))), + (PMAX_F16x8 V128:$lhs, V128:$rhs)>; //===----------------------------------------------------------------------===// // Conversions diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll index d9d3f6be800fdd..73ccea8d652db8 100644 --- a/llvm/test/CodeGen/WebAssembly/half-precision.ll +++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll @@ -35,3 +35,71 @@ define float @extract_lane_v8f16(<8 x half> %v) { %r = call float @llvm.wasm.extract.lane.f16x8(<8 x half> %v, i32 1) ret float %r } + +; CHECK-LABEL: add_v8f16: +; CHECK: f16x8.add $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @add_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fadd <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: sub_v8f16: +; CHECK: f16x8.sub $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @sub_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fsub <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: mul_v8f16: +; CHECK: f16x8.mul $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @mul_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fmul <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: div_v8f16: +; CHECK: f16x8.div $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +define <8 x half> @div_v8f16(<8 x half> %a, <8 x half> %b) { + %r = fdiv <8 x half> %a, %b + ret <8 x half> %r +} + +; CHECK-LABEL: 
min_intrinsic_v8f16: +; CHECK: f16x8.min $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>) +define <8 x half> @min_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) { + %a = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %a +} + +; CHECK-LABEL: max_intrinsic_v8f16: +; CHECK: f16x8.max $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>) +define <8 x half> @max_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) { + %a = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %a +} + +; CHECK-LABEL: pmin_intrinsic_v8f16: +; CHECK: f16x8.pmin $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.wasm.pmin.v8f16(<8 x half>, <8 x half>) +define <8 x half> @pmin_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) { + %v = call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %v +} + +; CHECK-LABEL: pmax_intrinsic_v8f16: +; CHECK: f16x8.pmax $push0=, $0, $1 +; CHECK-NEXT: return $pop0 +declare <8 x half> @llvm.wasm.pmax.v8f16(<8 x half>, <8 x half>) +define <8 x half> @pmax_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) { + %v = call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %v +} diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index d397188a9882ea..113a23da776fa9 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -851,4 +851,28 @@ main: # CHECK: f16x8.extract_lane 1 # encoding: [0xfd,0xa1,0x02,0x01] f16x8.extract_lane 1 + # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02] + f16x8.add + + # CHECK: f16x8.sub # encoding: [0xfd,0xb5,0x02] + f16x8.sub + + # CHECK: f16x8.mul # encoding: [0xfd,0xb6,0x02] + f16x8.mul + + # CHECK: f16x8.div # encoding: [0xfd,0xb7,0x02] + f16x8.div + + # CHECK: f16x8.min # encoding: [0xfd,0xb8,0x02] + 
f16x8.min + + # CHECK: f16x8.max # encoding: [0xfd,0xb9,0x02] + f16x8.max + + # CHECK: f16x8.pmin # encoding: [0xfd,0xba,0x02] + f16x8.pmin + + # CHECK: f16x8.pmax # encoding: [0xfd,0xbb,0x02] + f16x8.pmax + end_function From 0edc97f119f3ac3ff96b11183fe5c001a48a9a8d Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Tue, 28 May 2024 16:39:09 -0700 Subject: [PATCH 86/89] [IR][AArch64][PAC] Add "ptrauth(...)" Constant to represent signed pointers. (#85738) This defines a new kind of IR Constant that represents a ptrauth signed pointer, as used in AArch64 PAuth. It allows representing most kinds of signed pointer constants used thus far in the llvm ptrauth implementations, notably those used in the Darwin and ELF ABIs being implemented for c/c++. These signed pointer constants are then lowered to ELF/MachO relocations. These can be simply thought of as a constant `llvm.ptrauth.sign`, with the interesting addition of discriminator computation: the `ptrauth` constant can also represent a combined blend, when both address and integer discriminator operands are used. Both operands are otherwise optional, with default values 0/null. 
--- llvm/docs/LangRef.rst | 34 +++++ llvm/docs/PointerAuth.md | 22 ++++ llvm/include/llvm/AsmParser/LLToken.h | 1 + llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/include/llvm/IR/Constants.h | 66 ++++++++++ llvm/include/llvm/IR/Value.def | 1 + llvm/lib/Analysis/ValueTracking.cpp | 4 + llvm/lib/AsmParser/LLLexer.cpp | 1 + llvm/lib/AsmParser/LLParser.cpp | 54 ++++++++ llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp | 1 + llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 25 +++- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 6 + llvm/lib/IR/AsmWriter.cpp | 21 +++ llvm/lib/IR/Constants.cpp | 121 ++++++++++++++++++ llvm/lib/IR/ConstantsContext.h | 47 +++++++ llvm/lib/IR/LLVMContextImpl.h | 2 + llvm/lib/IR/Verifier.cpp | 23 ++++ llvm/test/Assembler/invalid-ptrauth-const1.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const2.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const3.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const4.ll | 6 + llvm/test/Assembler/invalid-ptrauth-const5.ll | 6 + llvm/test/Assembler/ptrauth-const.ll | 24 ++++ llvm/test/Bitcode/compatibility.ll | 4 + llvm/utils/vim/syntax/llvm.vim | 1 + 25 files changed, 488 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Assembler/invalid-ptrauth-const1.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const2.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const3.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const4.ll create mode 100644 llvm/test/Assembler/invalid-ptrauth-const5.ll create mode 100644 llvm/test/Assembler/ptrauth-const.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 614dd98b013b35..7b64c477d13c7f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4754,6 +4754,40 @@ reference to the CFI jump table in the ``LowerTypeTests`` pass. These constants may be useful in low-level programs, such as operating system kernels, which need to refer to the actual function body. +.. 
_ptrauth_constant: + +Pointer Authentication Constants +-------------------------------- + +``ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?)`` + +A '``ptrauth``' constant represents a pointer with a cryptographic +authentication signature embedded into some bits, as described in the +`Pointer Authentication `__ document. + +A '``ptrauth``' constant is simply a constant equivalent to the +``llvm.ptrauth.sign`` intrinsic, potentially fed by a discriminator +``llvm.ptrauth.blend`` if needed. + +Its type is the same as the first argument. An integer constant discriminator +and an address discriminator may be optionally specified. Otherwise, they have +values ``i64 0`` and ``ptr null``. + +If the address discriminator is ``null`` then the expression is equivalent to + +.. code-block:: llvm + + %tmp = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 DISC) + %val = inttoptr i64 %tmp to ptr + +Otherwise, the expression is equivalent to: + +.. code-block:: llvm + + %tmp1 = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr ADDRDISC to i64), i64 DISC) + %tmp2 = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 %tmp1) + %val = inttoptr i64 %tmp2 to ptr + .. _constantexprs: Constant Expressions diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md index a8d2b4d8f5f0bd..cf2cc6305f130f 100644 --- a/llvm/docs/PointerAuth.md +++ b/llvm/docs/PointerAuth.md @@ -16,6 +16,7 @@ For more details, see the clang documentation page for At the IR level, it is represented using: * a [set of intrinsics](#intrinsics) (to sign/authenticate pointers) +* a [signed pointer constant](#constant) (to sign globals) * a [call operand bundle](#operand-bundle) (to authenticate called pointers) The current implementation leverages the @@ -225,6 +226,27 @@ with a pointer address discriminator, in a way that is specified by the target implementation. 
+### Constant + +[Intrinsics](#intrinsics) can be used to produce signed pointers dynamically, +in code, but not for signed pointers referenced by constants, in, e.g., global +initializers. + +The latter are represented using a +[``ptrauth`` constant](https://llvm.org/docs/LangRef.html#ptrauth-constant), +which describes an authenticated relocation producing a signed pointer. + +```llvm +ptrauth (ptr CST, i32 KEY, i64 DISC, ptr ADDRDISC) +``` + +is equivalent to: + +```llvm + %disc = call i64 @llvm.ptrauth.blend(i64 ptrtoint(ptr ADDRDISC to i64), i64 DISC) + %signedval = call i64 @llvm.ptrauth.sign(ptr CST, i32 KEY, i64 %disc) +``` + ### Operand Bundle Function pointers used as indirect call targets can be signed when materialized, diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index df61ec6ed30e0b..69821c22dcd619 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -346,6 +346,7 @@ enum Kind { kw_blockaddress, kw_dso_local_equivalent, kw_no_cfi, + kw_ptrauth, kw_freeze, diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index d3b9e96520f88a..9999aee61528e5 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -413,6 +413,7 @@ enum ConstantsCodes { // asmstr,conststr] CST_CODE_CE_GEP_WITH_INRANGE = 31, // [opty, flags, range, n x operands] CST_CODE_CE_GEP = 32, // [opty, flags, n x operands] + CST_CODE_PTRAUTH = 33, // [ptr, key, disc, addrdisc] }; /// CastOpcodes - These are values used in the bitcode files to encode which diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index a1e5005a9d1da5..86f6be7985a23f 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -1008,6 +1008,72 @@ struct OperandTraits : public FixedNumOperandTraits { DEFINE_TRANSPARENT_OPERAND_ACCESSORS(NoCFIValue, Value) +/// A signed pointer, in the 
ptrauth sense. +class ConstantPtrAuth final : public Constant { + friend struct ConstantPtrAuthKeyType; + friend class Constant; + + ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc, + Constant *AddrDisc); + + void *operator new(size_t s) { return User::operator new(s, 4); } + + void destroyConstantImpl(); + Value *handleOperandChangeImpl(Value *From, Value *To); + +public: + /// Return a pointer signed with the specified parameters. + static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc); + + /// Produce a new ptrauth expression signing the given value using + /// the same schema as is stored in one. + ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const; + + /// Transparently provide more efficient getOperand methods. + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); + + /// The pointer that is signed in this ptrauth signed pointer. + Constant *getPointer() const { return cast(Op<0>().get()); } + + /// The Key ID, an i32 constant. + ConstantInt *getKey() const { return cast(Op<1>().get()); } + + /// The integer discriminator, an i64 constant, or 0. + ConstantInt *getDiscriminator() const { + return cast(Op<2>().get()); + } + + /// The address discriminator if any, or the null constant. + /// If present, this must be a value equivalent to the storage location of + /// the only global-initializer user of the ptrauth signed pointer. + Constant *getAddrDiscriminator() const { + return cast(Op<3>().get()); + } + + /// Whether there is any non-null address discriminator. + bool hasAddressDiscriminator() const { + return !getAddrDiscriminator()->isNullValue(); + } + + /// Check whether an authentication operation with key \p Key and (possibly + /// blended) discriminator \p Discriminator is known to be compatible with + /// this ptrauth signed pointer. 
+ bool isKnownCompatibleWith(const Value *Key, const Value *Discriminator, + const DataLayout &DL) const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const Value *V) { + return V->getValueID() == ConstantPtrAuthVal; + } +}; + +template <> +struct OperandTraits + : public FixedNumOperandTraits {}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPtrAuth, Constant) + //===----------------------------------------------------------------------===// /// A constant value that is initialized with an expression using /// other constant values. diff --git a/llvm/include/llvm/IR/Value.def b/llvm/include/llvm/IR/Value.def index 61f7a87666d094..3ece66a529e125 100644 --- a/llvm/include/llvm/IR/Value.def +++ b/llvm/include/llvm/IR/Value.def @@ -81,6 +81,7 @@ HANDLE_CONSTANT(BlockAddress) HANDLE_CONSTANT(ConstantExpr) HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(DSOLocalEquivalent) HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(NoCFIValue) +HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(ConstantPtrAuth) // ConstantAggregate. HANDLE_CONSTANT(ConstantArray) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 3baa8ede28ffaf..08138a5e2f2d9d 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -3140,6 +3140,10 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, return true; } + // Constant ptrauth can be null, iff the base pointer can be. + if (auto *CPA = dyn_cast(V)) + return isKnownNonZero(CPA->getPointer(), DemandedElts, Q, Depth); + // A global variable in address space 0 is non null unless extern weak // or an absolute symbol reference. Other address spaces may have null as a // valid address for a global, so we can't assume anything. 
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 20a1bd29577124..d3ab306904da12 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -710,6 +710,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(blockaddress); KEYWORD(dso_local_equivalent); KEYWORD(no_cfi); + KEYWORD(ptrauth); // Metadata types. KEYWORD(distinct); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 5d2056d2085672..df0827996396ef 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4046,6 +4046,60 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { ID.NoCFI = true; return false; } + case lltok::kw_ptrauth: { + // ValID ::= 'ptrauth' '(' ptr @foo ',' i32 + // (',' i64 (',' ptr addrdisc)? )? ')' + Lex.Lex(); + + Constant *Ptr, *Key; + Constant *Disc = nullptr, *AddrDisc = nullptr; + + if (parseToken(lltok::lparen, + "expected '(' in constant ptrauth expression") || + parseGlobalTypeAndValue(Ptr) || + parseToken(lltok::comma, + "expected comma in constant ptrauth expression") || + parseGlobalTypeAndValue(Key)) + return true; + // If present, parse the optional disc/addrdisc. 
+ if (EatIfPresent(lltok::comma)) + if (parseGlobalTypeAndValue(Disc) || + (EatIfPresent(lltok::comma) && parseGlobalTypeAndValue(AddrDisc))) + return true; + if (parseToken(lltok::rparen, + "expected ')' in constant ptrauth expression")) + return true; + + if (!Ptr->getType()->isPointerTy()) + return error(ID.Loc, "constant ptrauth base pointer must be a pointer"); + + auto *KeyC = dyn_cast(Key); + if (!KeyC || KeyC->getBitWidth() != 32) + return error(ID.Loc, "constant ptrauth key must be i32 constant"); + + ConstantInt *DiscC = nullptr; + if (Disc) { + DiscC = dyn_cast(Disc); + if (!DiscC || DiscC->getBitWidth() != 64) + return error( + ID.Loc, + "constant ptrauth integer discriminator must be i64 constant"); + } else { + DiscC = ConstantInt::get(Type::getInt64Ty(Context), 0); + } + + if (AddrDisc) { + if (!AddrDisc->getType()->isPointerTy()) + return error( + ID.Loc, "constant ptrauth address discriminator must be a pointer"); + } else { + AddrDisc = ConstantPointerNull::get(PointerType::get(Context, 0)); + } + + ID.ConstantVal = ConstantPtrAuth::get(Ptr, KeyC, DiscC, AddrDisc); + ID.Kind = ValID::t_Constant; + return false; + } case lltok::kw_trunc: case lltok::kw_bitcast: diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index c085c715179ba6..b7ed9cdf631454 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -222,6 +222,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID, STRINGIFY_CODE(CST_CODE, CE_UNOP) STRINGIFY_CODE(CST_CODE, DSO_LOCAL_EQUIVALENT) STRINGIFY_CODE(CST_CODE, NO_CFI_VALUE) + STRINGIFY_CODE(CST_CODE, PTRAUTH) case bitc::CST_CODE_BLOCKADDRESS: return "CST_CODE_BLOCKADDRESS"; STRINGIFY_CODE(CST_CODE, DATA) diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 32b9a033173e93..aee627bbde0bf5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ 
b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -517,7 +517,8 @@ class BitcodeConstant final : public Value, static constexpr uint8_t NoCFIOpcode = 252; static constexpr uint8_t DSOLocalEquivalentOpcode = 251; static constexpr uint8_t BlockAddressOpcode = 250; - static constexpr uint8_t FirstSpecialOpcode = BlockAddressOpcode; + static constexpr uint8_t ConstantPtrAuthOpcode = 249; + static constexpr uint8_t FirstSpecialOpcode = ConstantPtrAuthOpcode; // Separate struct to make passing different number of parameters to // BitcodeConstant::create() more convenient. @@ -1562,6 +1563,18 @@ Expected BitcodeReader::materializeValue(unsigned StartValID, C = ConstantExpr::get(BC->Opcode, ConstOps[0], ConstOps[1], BC->Flags); } else { switch (BC->Opcode) { + case BitcodeConstant::ConstantPtrAuthOpcode: { + auto *Key = dyn_cast(ConstOps[1]); + if (!Key) + return error("ptrauth key operand must be ConstantInt"); + + auto *Disc = dyn_cast(ConstOps[2]); + if (!Disc) + return error("ptrauth disc operand must be ConstantInt"); + + C = ConstantPtrAuth::get(ConstOps[0], Key, Disc, ConstOps[3]); + break; + } case BitcodeConstant::NoCFIOpcode: { auto *GV = dyn_cast(ConstOps[0]); if (!GV) @@ -3644,6 +3657,16 @@ Error BitcodeReader::parseConstants() { Record[1]); break; } + case bitc::CST_CODE_PTRAUTH: { + if (Record.size() < 4) + return error("Invalid ptrauth record"); + // Ptr, Key, Disc, AddrDisc + V = BitcodeConstant::create(Alloc, CurTy, + BitcodeConstant::ConstantPtrAuthOpcode, + {(unsigned)Record[0], (unsigned)Record[1], + (unsigned)Record[2], (unsigned)Record[3]}); + break; + } } assert(V->getType() == getTypeByID(CurTyID) && "Incorrect result type ID"); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 3d653fe4458f4b..046dad5721c4ce 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2848,6 +2848,12 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, 
unsigned LastVal, Code = bitc::CST_CODE_NO_CFI_VALUE; Record.push_back(VE.getTypeID(NC->getGlobalValue()->getType())); Record.push_back(VE.getValueID(NC->getGlobalValue())); + } else if (const auto *CPA = dyn_cast(C)) { + Code = bitc::CST_CODE_PTRAUTH; + Record.push_back(VE.getValueID(CPA->getPointer())); + Record.push_back(VE.getValueID(CPA->getKey())); + Record.push_back(VE.getValueID(CPA->getDiscriminator())); + Record.push_back(VE.getValueID(CPA->getAddrDiscriminator())); } else { #ifndef NDEBUG C->dump(); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ced5d78f994ab5..8b1a21f962b08f 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1594,6 +1594,27 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } + if (const ConstantPtrAuth *CPA = dyn_cast(CV)) { + Out << "ptrauth ("; + + // ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?) + unsigned NumOpsToWrite = 2; + if (!CPA->getOperand(2)->isNullValue()) + NumOpsToWrite = 3; + if (!CPA->getOperand(3)->isNullValue()) + NumOpsToWrite = 4; + + ListSeparator LS; + for (unsigned i = 0, e = NumOpsToWrite; i != e; ++i) { + Out << LS; + WriterCtx.TypePrinter->print(CPA->getOperand(i)->getType(), Out); + Out << ' '; + WriteAsOperandInternal(Out, CPA->getOperand(i), WriterCtx); + } + Out << ')'; + return; + } + if (const ConstantArray *CA = dyn_cast(CV)) { Type *ETy = CA->getType()->getElementType(); Out << '['; diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index cfb89d557db479..119fcb4fa03461 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -550,6 +550,9 @@ void llvm::deleteConstant(Constant *C) { case Constant::NoCFIValueVal: delete static_cast(C); break; + case Constant::ConstantPtrAuthVal: + delete static_cast(C); + break; case Constant::UndefValueVal: delete static_cast(C); break; @@ -2015,6 +2018,124 @@ Value *NoCFIValue::handleOperandChangeImpl(Value *From, Value *To) { return nullptr; } 
+//---- ConstantPtrAuth::get() implementations. +// + +ConstantPtrAuth *ConstantPtrAuth::get(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc) { + Constant *ArgVec[] = {Ptr, Key, Disc, AddrDisc}; + ConstantPtrAuthKeyType MapKey(ArgVec); + LLVMContextImpl *pImpl = Ptr->getContext().pImpl; + return pImpl->ConstantPtrAuths.getOrCreate(Ptr->getType(), MapKey); +} + +ConstantPtrAuth *ConstantPtrAuth::getWithSameSchema(Constant *Pointer) const { + return get(Pointer, getKey(), getDiscriminator(), getAddrDiscriminator()); +} + +ConstantPtrAuth::ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc) + : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, &Op<0>(), 4) { + assert(Ptr->getType()->isPointerTy()); + assert(Key->getBitWidth() == 32); + assert(Disc->getBitWidth() == 64); + assert(AddrDisc->getType()->isPointerTy()); + setOperand(0, Ptr); + setOperand(1, Key); + setOperand(2, Disc); + setOperand(3, AddrDisc); +} + +/// Remove the constant from the constant table. 
+void ConstantPtrAuth::destroyConstantImpl() { + getType()->getContext().pImpl->ConstantPtrAuths.remove(this); +} + +Value *ConstantPtrAuth::handleOperandChangeImpl(Value *From, Value *ToV) { + assert(isa(ToV) && "Cannot make Constant refer to non-constant!"); + Constant *To = cast(ToV); + + SmallVector Values; + Values.reserve(getNumOperands()); + + unsigned NumUpdated = 0; + + Use *OperandList = getOperandList(); + unsigned OperandNo = 0; + for (Use *O = OperandList, *E = OperandList + getNumOperands(); O != E; ++O) { + Constant *Val = cast(O->get()); + if (Val == From) { + OperandNo = (O - OperandList); + Val = To; + ++NumUpdated; + } + Values.push_back(Val); + } + + return getContext().pImpl->ConstantPtrAuths.replaceOperandsInPlace( + Values, this, From, To, NumUpdated, OperandNo); +} + +bool ConstantPtrAuth::isKnownCompatibleWith(const Value *Key, + const Value *Discriminator, + const DataLayout &DL) const { + // If the keys are different, there's no chance for this to be compatible. + if (getKey() != Key) + return false; + + // We can have 3 kinds of discriminators: + // - simple, integer-only: `i64 x, ptr null` vs. `i64 x` + // - address-only: `i64 0, ptr p` vs. `ptr p` + // - blended address/integer: `i64 x, ptr p` vs. `@llvm.ptrauth.blend(p, x)` + + // If this constant has a simple discriminator (integer, no address), easy: + // it's compatible iff the provided full discriminator is also a simple + // discriminator, identical to our integer discriminator. + if (!hasAddressDiscriminator()) + return getDiscriminator() == Discriminator; + + // Otherwise, we can isolate address and integer discriminator components. + const Value *AddrDiscriminator = nullptr; + + // This constant may or may not have an integer discriminator (instead of 0). + if (!getDiscriminator()->isNullValue()) { + // If it does, there's an implicit blend. We need to have a matching blend + // intrinsic in the provided full discriminator. 
+ if (!match(Discriminator, + m_Intrinsic( + m_Value(AddrDiscriminator), m_Specific(getDiscriminator())))) + return false; + } else { + // Otherwise, interpret the provided full discriminator as address-only. + AddrDiscriminator = Discriminator; + } + + // Either way, we can now focus on comparing the address discriminators. + + // Discriminators are i64, so the provided addr disc may be a ptrtoint. + if (auto *Cast = dyn_cast(AddrDiscriminator)) + AddrDiscriminator = Cast->getPointerOperand(); + + // Beyond that, we're only interested in compatible pointers. + if (getAddrDiscriminator()->getType() != AddrDiscriminator->getType()) + return false; + + // These are often the same constant GEP, making them trivially equivalent. + if (getAddrDiscriminator() == AddrDiscriminator) + return true; + + // Finally, they may be equivalent base+offset expressions. + APInt Off1(DL.getIndexTypeSizeInBits(getAddrDiscriminator()->getType()), 0); + auto *Base1 = getAddrDiscriminator()->stripAndAccumulateConstantOffsets( + DL, Off1, /*AllowNonInbounds=*/true); + + APInt Off2(DL.getIndexTypeSizeInBits(AddrDiscriminator->getType()), 0); + auto *Base2 = AddrDiscriminator->stripAndAccumulateConstantOffsets( + DL, Off2, /*AllowNonInbounds=*/true); + + return Base1 == Base2 && Off1 == Off2; +} + //---- ConstantExpr::get() implementations. 
// diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 7067d0d121117b..5153880b5cab64 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -23,6 +23,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -286,6 +287,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value) template struct ConstantAggrKeyType; struct InlineAsmKeyType; struct ConstantExprKeyType; +struct ConstantPtrAuthKeyType; template struct ConstantInfo; template <> struct ConstantInfo { @@ -308,6 +310,10 @@ template <> struct ConstantInfo { using ValType = ConstantAggrKeyType; using TypeClass = VectorType; }; +template <> struct ConstantInfo { + using ValType = ConstantPtrAuthKeyType; + using TypeClass = Type; +}; template struct ConstantAggrKeyType { ArrayRef Operands; @@ -536,6 +542,47 @@ struct ConstantExprKeyType { } }; +struct ConstantPtrAuthKeyType { + ArrayRef Operands; + + ConstantPtrAuthKeyType(ArrayRef Operands) : Operands(Operands) {} + + ConstantPtrAuthKeyType(ArrayRef Operands, const ConstantPtrAuth *) + : Operands(Operands) {} + + ConstantPtrAuthKeyType(const ConstantPtrAuth *C, + SmallVectorImpl &Storage) { + assert(Storage.empty() && "Expected empty storage"); + for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I) + Storage.push_back(cast(C->getOperand(I))); + Operands = Storage; + } + + bool operator==(const ConstantPtrAuthKeyType &X) const { + return Operands == X.Operands; + } + + bool operator==(const ConstantPtrAuth *C) const { + if (Operands.size() != C->getNumOperands()) + return false; + for (unsigned I = 0, E = Operands.size(); I != E; ++I) + if (Operands[I] != C->getOperand(I)) + return false; + return true; + } + + unsigned getHash() const { + return hash_combine_range(Operands.begin(), Operands.end()); + } + + 
using TypeClass = typename ConstantInfo::TypeClass; + + ConstantPtrAuth *create(TypeClass *Ty) const { + return new ConstantPtrAuth(Operands[0], cast(Operands[1]), + cast(Operands[2]), Operands[3]); + } +}; + // Free memory for a given constant. Assumes the constant has already been // removed from all relevant maps. void deleteConstant(Constant *C); diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 399fe0dad26c73..392e0d16f1761e 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -1562,6 +1562,8 @@ class LLVMContextImpl { DenseMap NoCFIValues; + ConstantUniqueMap ConstantPtrAuths; + ConstantUniqueMap ExprConstants; ConstantUniqueMap InlineAsms; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 50f8d6ec842017..684e54444621b5 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -629,6 +629,7 @@ class Verifier : public InstVisitor, VerifierSupport { void visitConstantExprsRecursively(const Constant *EntryC); void visitConstantExpr(const ConstantExpr *CE); + void visitConstantPtrAuth(const ConstantPtrAuth *CPA); void verifyInlineAsmCall(const CallBase &Call); void verifyStatepoint(const CallBase &Call); void verifyFrameRecoverIndices(); @@ -2422,6 +2423,9 @@ void Verifier::visitConstantExprsRecursively(const Constant *EntryC) { if (const auto *CE = dyn_cast(C)) visitConstantExpr(CE); + if (const auto *CPA = dyn_cast(C)) + visitConstantPtrAuth(CPA); + if (const auto *GV = dyn_cast(C)) { // Global Values get visited separately, but we do need to make sure // that the global value is in the correct module @@ -2449,6 +2453,23 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) { "Invalid bitcast", CE); } +void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) { + Check(CPA->getPointer()->getType()->isPointerTy(), + "signed ptrauth constant base pointer must have pointer type"); + + Check(CPA->getType() == CPA->getPointer()->getType(), + "signed ptrauth 
constant must have same type as its base pointer"); + + Check(CPA->getKey()->getBitWidth() == 32, + "signed ptrauth constant key must be i32 constant integer"); + + Check(CPA->getAddrDiscriminator()->getType()->isPointerTy(), + "signed ptrauth constant address discriminator must be a pointer"); + + Check(CPA->getDiscriminator()->getBitWidth() == 64, + "signed ptrauth constant discriminator must be i64 constant integer"); +} + bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) { // There shouldn't be more attribute sets than there are parameters plus the // function and return value. @@ -5090,6 +5111,8 @@ void Verifier::visitInstruction(Instruction &I) { } else if (isa(I.getOperand(i))) { Check(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i), "Cannot take the address of an inline asm!", &I); + } else if (auto *CPA = dyn_cast(I.getOperand(i))) { + visitConstantExprsRecursively(CPA); } else if (ConstantExpr *CE = dyn_cast(I.getOperand(i))) { if (CE->getType()->isPtrOrPtrVectorTy()) { // If we have a ConstantExpr pointer, we need to see if it came from an diff --git a/llvm/test/Assembler/invalid-ptrauth-const1.ll b/llvm/test/Assembler/invalid-ptrauth-const1.ll new file mode 100644 index 00000000000000..fba2e230782382 --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const1.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth base pointer must be a pointer +@auth_var = global ptr ptrauth (i32 42, i32 0) diff --git a/llvm/test/Assembler/invalid-ptrauth-const2.ll b/llvm/test/Assembler/invalid-ptrauth-const2.ll new file mode 100644 index 00000000000000..4499c42601c99e --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const2.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth key must be i32 constant +@auth_var = global ptr ptrauth (ptr @var, i32 ptrtoint (ptr @var to i32)) diff --git 
a/llvm/test/Assembler/invalid-ptrauth-const3.ll b/llvm/test/Assembler/invalid-ptrauth-const3.ll new file mode 100644 index 00000000000000..3f2688d92a0010 --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const3.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth address discriminator must be a pointer +@auth_var = global ptr ptrauth (ptr @var, i32 2, i64 65535, i8 0) diff --git a/llvm/test/Assembler/invalid-ptrauth-const4.ll b/llvm/test/Assembler/invalid-ptrauth-const4.ll new file mode 100644 index 00000000000000..843a220458a61b --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const4.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth integer discriminator must be i64 constant +@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr null, i64 ptrtoint (ptr @var to i64)) diff --git a/llvm/test/Assembler/invalid-ptrauth-const5.ll b/llvm/test/Assembler/invalid-ptrauth-const5.ll new file mode 100644 index 00000000000000..9b47f6f5f423fc --- /dev/null +++ b/llvm/test/Assembler/invalid-ptrauth-const5.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +@var = global i32 0 + +; CHECK: error: constant ptrauth integer discriminator must be i64 constant +@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr @var)) diff --git a/llvm/test/Assembler/ptrauth-const.ll b/llvm/test/Assembler/ptrauth-const.ll new file mode 100644 index 00000000000000..94d35146d5927b --- /dev/null +++ b/llvm/test/Assembler/ptrauth-const.ll @@ -0,0 +1,24 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +@var = global i32 0 + +; CHECK: @basic = global ptr ptrauth (ptr @var, i32 0) +@basic = global ptr ptrauth (ptr @var, i32 0) + +; CHECK: @keyed = global ptr ptrauth (ptr @var, i32 3) +@keyed = global ptr ptrauth (ptr @var, i32 3) + +; CHECK: @intdisc = global ptr ptrauth (ptr @var, i32 0, i64 -1) +@intdisc = global ptr ptrauth (ptr @var, 
i32 0, i64 -1) + +; CHECK: @addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc) +@addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc) + + +@var1 = addrspace(1) global i32 0 + +; CHECK: @addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0) +@addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0) + +; CHECK: @addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc) +@addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc) diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index b374924516d665..2a846e036924c7 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -217,6 +217,10 @@ declare void @g.f1() ; CHECK: @g.sanitize_address_dyninit = global i32 0, sanitize_address_dyninit ; CHECK: @g.sanitize_multiple = global i32 0, sanitize_memtag, sanitize_address_dyninit +; ptrauth constant +@auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535, ptr null) +; CHECK: @auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535) + ;; Aliases ; Format: @ = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal] ; [unnamed_addr] alias @ diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index d86e3d1ddbc27f..905d696400ca37 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -150,6 +150,7 @@ syn keyword llvmKeyword \ preallocated \ private \ protected + \ ptrauth \ ptx_device \ ptx_kernel \ readnone From 6f529aaf666624c26715aa348955b26a684d1250 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 28 May 2024 23:37:40 +0000 Subject: [PATCH 87/89] [WebAssembly] Remove IIT_EXNREF This was added in #93586 but caused a compilation warning and is not used anyway. 
--- llvm/include/llvm/IR/Intrinsics.td | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index c3ac53837444ef..107442623ab7bd 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -316,7 +316,6 @@ def IIT_PPCF128 : IIT_VT; def IIT_V3 : IIT_Vec<3, 53>; def IIT_EXTERNREF : IIT_VT; def IIT_FUNCREF : IIT_VT; -def IIT_EXNREF: IIT_VT; def IIT_I2 : IIT_Int<2, 57>; def IIT_I4 : IIT_Int<4, 58>; def IIT_AARCH64_SVCOUNT : IIT_VT; From bd5cd4b837b67f8d549f072f37dd09295b4bf9f7 Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Tue, 28 May 2024 20:01:47 -0400 Subject: [PATCH 88/89] Fix trigger for libc++ job rerunner. Testing github actions is such a pain. I swear it should match now. --- .github/workflows/restart-preempted-libcxx-jobs.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml index 5682b0a4f52c3d..88924fb3cd7791 100644 --- a/.github/workflows/restart-preempted-libcxx-jobs.yaml +++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml @@ -11,18 +11,16 @@ name: Restart Preempted Libc++ Workflow on: workflow_run: - workflows: - - Build and Test libc\+\+ + workflows: [Build and Test libc\+\+] types: - - failure - - canceled + - completed permissions: contents: read jobs: restart: - if: github.repository_owner == 'llvm' + if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled') name: "Restart Job" permissions: statuses: read From 5bfe4b93e15ad38f211c5dec64be0eeaa4c8e914 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 28 May 2024 20:04:41 -0400 Subject: [PATCH 89/89] [mlir][arith] Disallow casting tensor dimensions (#93349) Tighten the verifier for arith cast ops to disallow changing tensor dimensions, e.g., static to dynamic. 
After this change: * `arith.cast_op %x : tensor<4xi32> to tensor<4xf32>` remains valid * `arith.cast_op %x : tensor<4xi32> to tensor` becomes invalid * `arith.cast_op %x : tensor to tensor<4xf32>` becomes invalid This is mostly to simplify the op semantics. See the discussion thread for more context: https://discourse.llvm.org/t/rfc-remove-arith-math-ops-on-tensors/74357/63. --- .../include/mlir/Dialect/Arith/IR/ArithOps.td | 19 +++++++-- mlir/test/Dialect/Arith/canonicalize.mlir | 8 ---- mlir/test/Dialect/Arith/invalid.mlir | 42 ++++++++++++++++++- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td index 46248dad3be9e0..81ed0f924a2e2c 100644 --- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td +++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td @@ -83,12 +83,25 @@ class Arith_FloatBinaryOp traits = []> : attr-dict `:` type($result) }]; } +// Checks that tensor input and outputs have identical shapes. This is stricker +// than the verification done in `SameOperandsAndResultShape` that allows for +// tensor dimensions to be 'compatible' (e.g., dynamic dimensions being +// compatible with static ones). +def SameInputOutputTensorDims : PredOpTrait< + "input and output have the same tensor dimensions", + AllMatchSameOperatorPred<["in", "out"], + "(::llvm::isa<::mlir::TensorType>($_self.getType()) ?" + " ::llvm::cast<::mlir::TensorType>($_self.getType()).getShape() :" + " ::llvm::ArrayRef{})">>; + // Base class for arithmetic cast operations. Requires a single operand and -// result. If either is a shaped type, then the other must be of the same shape. +// result. If either is a shaped type, then the other must be of the same +// shape. In the case of tensor types, this also includes the corresponding +// operand/result dimensions being equal. 
class Arith_CastOp traits = []> : Arith_Op]>, + SameInputOutputTensorDims, DeclareOpInterfaceMethods]>, Arguments<(ins From:$in)>, Results<(outs To:$out)> { let assemblyFormat = "$in attr-dict `:` type($in) `to` type($out)"; @@ -1231,7 +1244,7 @@ def Arith_TruncIOp : Arith_IToICastOp<"trunci"> { def Arith_TruncFOp : Arith_Op<"truncf", - [Pure, SameOperandsAndResultShape, + [Pure, SameOperandsAndResultShape, SameInputOutputTensorDims, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>, Arguments<(ins FloatLike:$in, diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 1a387c20c4b297..e4f95bb0545a20 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2950,14 +2950,6 @@ func.func @unsignedExtendConstantResource() -> tensor { return %ext : tensor } -// Just checks that this doesn't crash. -// CHECK-LABEL: @signedExtendSplatAsDynamicShape -func.func @signedExtendSplatAsDynamicShape() -> tensor { - %splat = arith.constant dense<5> : tensor<2xi16> - %extsplat = arith.extsi %splat : tensor<2xi16> to tensor - return %extsplat : tensor -} - // CHECK-LABEL: @extsi_i0 // CHECK: %[[ZERO:.*]] = arith.constant 0 : i16 // CHECK: return %[[ZERO]] : i16 diff --git a/mlir/test/Dialect/Arith/invalid.mlir b/mlir/test/Dialect/Arith/invalid.mlir index ada849220bb839..652aa738ad3924 100644 --- a/mlir/test/Dialect/Arith/invalid.mlir +++ b/mlir/test/Dialect/Arith/invalid.mlir @@ -1,13 +1,21 @@ // RUN: mlir-opt -split-input-file %s -verify-diagnostics func.func @test_index_cast_shape_error(%arg0 : tensor) -> tensor<2xi64> { - // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}} + // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}} %0 = arith.index_cast %arg0 : tensor to tensor<2xi64> return %0 : tensor<2xi64> } // ----- +func.func @test_index_cast_shape_dim_error(%arg0 : 
tensor<2xindex>) -> tensor { + // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.index_cast %arg0 : tensor<2xindex> to tensor + return %0 : tensor +} + +// ----- + func.func @test_index_cast_tensor_error(%arg0 : tensor) -> i64 { // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}} %0 = arith.index_cast %arg0 : tensor to i64 @@ -655,6 +663,14 @@ func.func @extsi_scalable_to_fl(%arg0 : vector<[4]xi32>) { // ----- +func.func @extsi_tensor_dim(%arg0 : tensor<4xi32>) { + // expected-error@+1 {{'arith.extsi' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.extsi %arg0 : tensor<4xi32> to tensor + return +} + +// ----- + func.func @extf_scalable_to_fl(%arg0 : vector<[4]xf32>) { // expected-error@+1 {{'arith.extf' op requires the same shape for all operands and results}} %0 = arith.extf %arg0 : vector<[4]xf32> to vector<4xf64> @@ -703,6 +719,22 @@ func.func @bitcast_scalable_to_fl(%arg0 : vector<[4]xf32>) { // ----- +func.func @bitcast_tensor_dim(%arg0 : tensor<4xf32>) { + // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.bitcast %arg0 : tensor<4xf32> to tensor + return +} + +// ----- + +func.func @bitcast_tensor_dim(%arg0 : tensor) { + // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}} + %0 = arith.bitcast %arg0 : tensor to tensor<4xi32> + return +} + +// ----- + func.func @trunci_fl_to_scalable(%arg0 : vector<4xi32>) { // expected-error@+1 {{'arith.trunci' op requires the same shape for all operands and results}} %0 = arith.trunci %arg0 : vector<4xi32> to vector<[4]xi8> @@ -719,6 +751,14 @@ func.func @truncf_fl_to_scalable(%arg0 : vector<4xf64>) { // ----- +func.func @truncf_tensor_dim(%arg0 : tensor<4xf64>) { + // expected-error@+1 {{'arith.truncf' op failed to 
verify that input and output have the same tensor dimensions}} + %0 = arith.truncf %arg0 : tensor<4xf64> to tensor + return +} + +// ----- + func.func @extui_fl_to_scalable(%arg0 : vector<4xi32>) { // expected-error@+1 {{'arith.extui' op requires the same shape for all operands and results}} %0 = arith.extui %arg0 : vector<4xi32> to vector<[4]xi64>