Skip to content

Commit 1d3c822

Browse files
authored
[Mono] Enable the supported V128 SIMD intrinsics on Arm64 across all codegen engines (#84289)
* Enable the supported ones * Add support for Create* and fix a bug * Fix CreateScalar for floating types * Fix create* * Address review feedback
1 parent e3c3700 commit 1d3c822

File tree

5 files changed

+194
-128
lines changed

5 files changed

+194
-128
lines changed

src/mono/mono/arch/arm64/arm64-codegen.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1111,8 +1111,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
11111111
/* NEON :: extract */
11121112
/* Builds a NEON extract-class instruction: ORs (op2) << 22 and (imm4) << 11 into the fixed base bit pattern, then passes the result together with rd, rn and rm to arm_neon_opcode_3reg. */
#define arm_neon_extr_opcode(p, q, op2, imm4, rd, rn, rm) arm_neon_opcode_3reg ((p), (q), 0b00101110000000000000000000000000 | (op2) << 22 | (imm4) << 11, (rd), (rn), (rm))
11131113

1114-
#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rd))
1115-
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rd))
1114+
/* VREG_LOW form of the extract instruction: op2 is fixed to 0b00, index is passed as imm4, and rd, rn and rm are forwarded unchanged to arm_neon_extr_opcode. */
#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rm))
1115+
/* VREG_FULL form of the extract instruction: op2 is fixed to 0b00, index is passed as imm4, and rd, rn and rm are forwarded unchanged to arm_neon_extr_opcode. */
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rm))
11161116

11171117
/* NEON :: copy */
11181118
/* Builds a NEON copy-class instruction: ORs (op) << 29, (imm5) << 16 and (imm4) << 11 into the fixed base bit pattern, then passes the result together with rd and rn to arm_neon_opcode_2reg. */
#define arm_neon_cpy_opcode(p, q, op, imm5, imm4, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110000000000000010000000000 | (op) << 29 | (imm5) << 16 | (imm4) << 11, (rd), (rn))

src/mono/mono/mini/cpu-arm64.mdesc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,16 @@ expand_i4: dest:x src1:i len:4
521521
expand_i8: dest:x src1:i len:4
522522
expand_r4: dest:x src1:f len:4
523523
expand_r8: dest:x src1:f len:4
524+
insert_i1: dest:x src1:i len:4
525+
insert_i2: dest:x src1:i len:4
526+
insert_i4: dest:x src1:i len:4
527+
insert_i8: dest:x src1:i len:4
528+
insert_r4: dest:x src1:f len:4
529+
insert_r8: dest:x src1:f len:4
530+
create_scalar_int: dest:x src1:i len:8
531+
create_scalar_float: dest:x src1:f len:12
532+
create_scalar_unsafe_int: dest:x src1:i len:4
533+
create_scalar_unsafe_float: dest:x src1:f len:4
524534

525535
generic_class_init: src1:a len:44 clob:c
526536
gc_safe_point: src1:i len:12 clob:c

src/mono/mono/mini/mini-arm64.c

Lines changed: 121 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -3717,6 +3717,48 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
37173717
}
37183718
break;
37193719
}
3720+
/* SIMD that is not table-generated */
3721+
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
3722+
* move the following two to the codegen table in simd-arm64.h
3723+
*/
3724+
case OP_ONES_COMPLEMENT:
3725+
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
3726+
break;
3727+
case OP_NEGATION:
3728+
if (is_type_float_macro (ins->inst_c1)) {
3729+
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
3730+
} else {
3731+
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
3732+
}
3733+
break;
3734+
case OP_XBINOP:
3735+
switch (ins->inst_c0) {
3736+
case OP_IMAX:
3737+
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
3738+
break;
3739+
case OP_IMAX_UN:
3740+
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
3741+
break;
3742+
case OP_IMIN:
3743+
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
3744+
break;
3745+
case OP_IMIN_UN:
3746+
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
3747+
break;
3748+
default:
3749+
g_assert_not_reached ();
3750+
}
3751+
break;
3752+
case OP_XZERO:
3753+
arm_neon_eor_16b (code, dreg, dreg, dreg);
3754+
break;
3755+
case OP_XONES:
3756+
arm_neon_eor_16b (code, dreg, dreg, dreg);
3757+
arm_neon_not_16b (code, dreg, dreg);
3758+
break;
3759+
case OP_XEXTRACT:
3760+
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
3761+
break;
37203762
case OP_STOREX_MEMBASE:
37213763
code = emit_strfpq (code, sreg1, dreg, ins->inst_offset);
37223764
break;
@@ -3730,10 +3772,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
37303772
if (cfg->compile_aot && cfg->code_exec_only) {
37313773
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
37323774
arm_ldrx_lit (code, ARMREG_IP0, 0);
3733-
arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0);
3775+
arm_ldrfpq (code, dreg, ARMREG_IP0, 0);
37343776
} else {
37353777
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
3736-
arm_neon_ldrq_lit (code, ins->dreg, 0);
3778+
arm_neon_ldrq_lit (code, dreg, 0);
37373779
}
37383780
break;
37393781
}
@@ -3744,13 +3786,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
37443786
case OP_EXPAND_I4:
37453787
case OP_EXPAND_I8: {
37463788
const int t = get_type_size_macro (ins->inst_c1);
3747-
arm_neon_dup_g (code, VREG_FULL, t, ins->dreg, ins->sreg1);
3789+
arm_neon_dup_g (code, VREG_FULL, t, dreg, sreg1);
37483790
break;
37493791
}
37503792
case OP_EXPAND_R4:
37513793
case OP_EXPAND_R8: {
37523794
const int t = get_type_size_macro (ins->inst_c1);
3753-
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, 0);
3795+
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, 0);
37543796
break;
37553797
}
37563798
case OP_EXTRACT_I1:
@@ -3760,9 +3802,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
37603802
const int t = get_type_size_macro (ins->inst_c1);
37613803
// smov is not defined for i64
37623804
if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) {
3763-
arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
3805+
arm_neon_umov (code, t, dreg, sreg1, ins->inst_c0);
37643806
} else {
3765-
arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
3807+
arm_neon_smov (code, t, dreg, sreg1, ins->inst_c0);
37663808
}
37673809
break;
37683810
}
@@ -3773,17 +3815,39 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
37733815
// Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should
37743816
// set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest
37753817
// of the F/XREG is ignored in FREG mode, this operation remains valid.
3776-
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0);
3818+
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, ins->inst_c0);
37773819
}
37783820
break;
3821+
case OP_INSERT_I1:
3822+
case OP_INSERT_I2:
3823+
case OP_INSERT_I4:
3824+
case OP_INSERT_I8: {
3825+
const int t = get_type_size_macro (ins->inst_c1);
3826+
arm_neon_ins_g(code, t, dreg, sreg1, ins->inst_c0);
3827+
break;
3828+
}
3829+
case OP_INSERT_R4:
3830+
case OP_INSERT_R8: {
3831+
int t = 0;
3832+
switch (ins->inst_c1) {
3833+
case MONO_TYPE_R4:
3834+
t = SIZE_4;
3835+
break;
3836+
case MONO_TYPE_R8:
3837+
t = SIZE_8;
3838+
break;
3839+
}
3840+
arm_neon_ins_e(code, t, dreg, sreg1, ins->inst_c0, 0);
3841+
break;
3842+
}
37793843
case OP_ARM64_XADDV: {
37803844
switch (ins->inst_c0) {
37813845
case INTRINS_AARCH64_ADV_SIMD_FADDV:
37823846
if (ins->inst_c1 == MONO_TYPE_R8) {
3783-
arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1);
3847+
arm_neon_faddp (code, VREG_FULL, TYPE_F64, dreg, sreg1, sreg1);
37843848
} else if (ins->inst_c1 == MONO_TYPE_R4) {
3785-
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1);
3786-
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg);
3849+
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, sreg1, sreg1);
3850+
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, dreg, dreg);
37873851
} else {
37883852
g_assert_not_reached ();
37893853
}
@@ -3792,7 +3856,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
37923856
case INTRINS_AARCH64_ADV_SIMD_UADDV:
37933857
case INTRINS_AARCH64_ADV_SIMD_SADDV:
37943858
if (get_type_size_macro (ins->inst_c1) == TYPE_I64)
3795-
arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1);
3859+
arm_neon_addp (code, VREG_FULL, TYPE_I64, dreg, sreg1, sreg1);
37963860
else
37973861
g_assert_not_reached (); // remaining int types are handled through the codegen table
37983862
break;
@@ -3802,6 +3866,52 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
38023866
}
38033867
break;
38043868
}
3869+
case OP_CREATE_SCALAR_INT: {
3870+
const int t = get_type_size_macro (ins->inst_c1);
3871+
arm_neon_eor_16b (code, dreg, dreg, dreg);
3872+
arm_neon_ins_g(code, t, dreg, sreg1, 0);
3873+
break;
3874+
}
3875+
case OP_CREATE_SCALAR_FLOAT: {
3876+
int t = 0;
3877+
switch (ins->inst_c1) {
3878+
case MONO_TYPE_R4:
3879+
t = SIZE_4;
3880+
break;
3881+
case MONO_TYPE_R8:
3882+
t = SIZE_8;
3883+
break;
3884+
}
3885+
// Use a temp register for zero op, as sreg1 and dreg share the same register here
3886+
arm_neon_eor_16b (code, NEON_TMP_REG, NEON_TMP_REG, NEON_TMP_REG);
3887+
arm_neon_ins_e(code, t, NEON_TMP_REG, sreg1, 0, 0);
3888+
arm_neon_mov (code, dreg, NEON_TMP_REG);
3889+
break;
3890+
}
3891+
case OP_CREATE_SCALAR_UNSAFE_INT: {
3892+
const int t = get_type_size_macro (ins->inst_c1);
3893+
arm_neon_ins_g(code, t, dreg, sreg1, 0);
3894+
break;
3895+
}
3896+
case OP_CREATE_SCALAR_UNSAFE_FLOAT: {
3897+
if (dreg != sreg1) {
3898+
int t = 0;
3899+
switch (ins->inst_c1) {
3900+
case MONO_TYPE_R4:
3901+
t = SIZE_4;
3902+
break;
3903+
case MONO_TYPE_R8:
3904+
t = SIZE_8;
3905+
break;
3906+
}
3907+
arm_neon_ins_e(code, t, dreg, sreg1, 0, 0);
3908+
}
3909+
break;
3910+
}
3911+
// Enable this when adding support for Narrow and enable support for Create at the same time
3912+
// case OP_XCONCAT:
3913+
// arm_neon_ext_16b(code, dreg, sreg1, sreg2, 8);
3914+
// break;
38053915

38063916
/* BRANCH */
38073917
case OP_BR:
@@ -3875,49 +3985,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
38753985
arm_cbnzx (code, sreg1, 0);
38763986
break;
38773987

3878-
/* SIMD that is not table-generated */
3879-
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
3880-
* move the following two to the codegen table in simd-arm64.h
3881-
*/
3882-
case OP_ONES_COMPLEMENT:
3883-
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
3884-
break;
3885-
case OP_NEGATION:
3886-
if (is_type_float_macro (ins->inst_c1)) {
3887-
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
3888-
} else {
3889-
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
3890-
}
3891-
break;
3892-
case OP_XBINOP:
3893-
switch (ins->inst_c0) {
3894-
case OP_IMAX:
3895-
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
3896-
break;
3897-
case OP_IMAX_UN:
3898-
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
3899-
break;
3900-
case OP_IMIN:
3901-
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
3902-
break;
3903-
case OP_IMIN_UN:
3904-
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
3905-
break;
3906-
default:
3907-
g_assert_not_reached ();
3908-
}
3909-
break;
3910-
case OP_XZERO:
3911-
arm_neon_eor_16b (code, dreg, dreg, dreg);
3912-
break;
3913-
case OP_XONES:
3914-
arm_neon_eor_16b (code, dreg, dreg, dreg);
3915-
arm_neon_not_16b (code, dreg, dreg);
3916-
break;
3917-
case OP_XEXTRACT:
3918-
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
3919-
break;
3920-
39213988
/* ALU */
39223989
case OP_IADD:
39233990
arm_addw (code, dreg, sreg1, sreg2);

src/mono/mono/mini/mini-ops.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1168,6 +1168,11 @@ MINI_OP3(OP_MULX_HL64, "mulxhl64", LREG, LREG, LREG, LREG)
11681168
MINI_OP(OP_CREATE_SCALAR_UNSAFE, "create_scalar_unsafe", XREG, XREG, NONE)
11691169
MINI_OP(OP_CREATE_SCALAR, "create_scalar", XREG, XREG, NONE)
11701170

1171+
MINI_OP(OP_CREATE_SCALAR_UNSAFE_INT, "create_scalar_unsafe_int", XREG, IREG, NONE)
1172+
MINI_OP(OP_CREATE_SCALAR_UNSAFE_FLOAT, "create_scalar_unsafe_float", XREG, FREG, NONE)
1173+
MINI_OP(OP_CREATE_SCALAR_INT, "create_scalar_int", XREG, IREG, NONE)
1174+
MINI_OP(OP_CREATE_SCALAR_FLOAT, "create_scalar_float", XREG, FREG, NONE)
1175+
11711176
MINI_OP(OP_XMOVE, "xmove", XREG, XREG, NONE)
11721177
MINI_OP(OP_XZERO, "xzero", XREG, NONE, NONE)
11731178
MINI_OP(OP_XONES, "xones", XREG, NONE, NONE)

0 commit comments

Comments
 (0)