Skip to content

Commit

Permalink
Float32 min/max/rounding intrinsics (#2684)
Browse files Browse the repository at this point in the history
  • Loading branch information
TheNumbat authored Jun 13, 2024
1 parent 7968d25 commit 12ba4d9
Show file tree
Hide file tree
Showing 19 changed files with 404 additions and 53 deletions.
15 changes: 11 additions & 4 deletions backend/amd64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,13 @@ let emit_simd_instr op i =
| CLMUL (Clmul_64 n) -> I.pclmulqdq (X86_dsl.int n) (arg i 1) (res i 0)
| BMI2 Extract_64 -> I.pext (arg i 1) (arg i 0) (res i 0)
| BMI2 Deposit_64 -> I.pdep (arg i 1) (arg i 0) (res i 0)
| SSE Round_current_f32_i64 -> I.cvtss2si (arg i 0) (res i 0)
| SSE Sqrt_scalar_f32 ->
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.sqrtss (arg i 0) (res i 0)
| SSE Max_scalar_f32 -> I.maxss (arg i 1) (res i 0)
| SSE Min_scalar_f32 -> I.minss (arg i 1) (res i 0)
| SSE (Cmp_f32 n) -> I.cmpps n (arg i 1) (res i 0)
| SSE Add_f32 -> I.addps (arg i 1) (res i 0)
| SSE Sub_f32 -> I.subps (arg i 1) (res i 0)
Expand All @@ -1109,10 +1116,6 @@ let emit_simd_instr op i =
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.sqrtsd (arg i 0) (res i 0)
| SSE2 Sqrt_scalar_f32 ->
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.sqrtss (arg i 0) (res i 0)
| SSE2 Sqrt_f64 -> I.sqrtpd (arg i 0) (res i 0)
| SSE2 Add_i8 -> I.paddb (arg i 1) (res i 0)
| SSE2 Add_i16 -> I.paddw (arg i 1) (res i 0)
Expand Down Expand Up @@ -1271,6 +1274,10 @@ let emit_simd_instr op i =
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.roundsd n (arg i 0) (res i 0)
| SSE41 (Round_scalar_f32 n) ->
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.roundss n (arg i 0) (res i 0)
| SSE41 (Round_f64 n) -> I.roundpd n (arg i 0) (res i 0)
| SSE41 (Round_f32 n) -> I.roundps n (arg i 0) (res i 0)
| SSE41 (Multi_sad_unsigned_i8 n) -> I.mpsadbw (X86_dsl.int n) (arg i 1) (res i 0)
Expand Down
91 changes: 55 additions & 36 deletions backend/amd64/simd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ type bmi2_operation =
| Extract_64

type sse_operation =
| Round_current_f32_i64
| Sqrt_scalar_f32
| Min_scalar_f32
| Max_scalar_f32
| Cmp_f32 of float_condition
| Add_f32
| Sub_f32
Expand All @@ -89,7 +93,6 @@ type sse_operation =
type sse2_operation =
| Round_current_f64_i64
| Sqrt_scalar_f64
| Sqrt_scalar_f32
| Min_scalar_f64
| Max_scalar_f64
| Sqrt_f64
Expand Down Expand Up @@ -207,6 +210,7 @@ type ssse3_operation =

type sse41_operation =
| Round_scalar_f64 of float_rounding
| Round_scalar_f32 of float_rounding
| Blend_16 of int
| Blend_32 of int
| Blend_64 of int
Expand Down Expand Up @@ -289,6 +293,10 @@ let equal_operation_bmi2 l r =

let equal_operation_sse l r =
match l, r with
| Round_current_f32_i64, Round_current_f32_i64
| Sqrt_scalar_f32, Sqrt_scalar_f32
| Min_scalar_f32, Min_scalar_f32
| Max_scalar_f32, Max_scalar_f32
| Add_f32, Add_f32
| Sub_f32, Sub_f32
| Mul_f32, Mul_f32
Expand All @@ -307,10 +315,11 @@ let equal_operation_sse l r =
true
| Cmp_f32 l, Cmp_f32 r when float_condition_equal l r -> true
| Shuffle_32 l, Shuffle_32 r when Int.equal l r -> true
| ( ( Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32 | Rcp_f32
| Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64
| Interleave_high_32 | Interleave_low_32_regs | Interleave_low_32
| Movemask_32 | Cmp_f32 _ | Shuffle_32 _ ),
| ( ( Round_current_f32_i64 | Sqrt_scalar_f32 | Min_scalar_f32
| Max_scalar_f32 | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32
| Min_f32 | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64
| Low_64_to_high_64 | Interleave_high_32 | Interleave_low_32_regs
| Interleave_low_32 | Movemask_32 | Cmp_f32 _ | Shuffle_32 _ ),
_ ) ->
false

Expand All @@ -320,7 +329,6 @@ let equal_operation_sse2 l r =
| Min_scalar_f64, Min_scalar_f64
| Max_scalar_f64, Max_scalar_f64
| Sqrt_scalar_f64, Sqrt_scalar_f64
| Sqrt_scalar_f32, Sqrt_scalar_f32
| Sqrt_f64, Sqrt_f64
| Add_i8, Add_i8
| Add_i16, Add_i16
Expand Down Expand Up @@ -409,26 +417,25 @@ let equal_operation_sse2 l r =
true
| Cmp_f64 l, Cmp_f64 r when float_condition_equal l r -> true
| ( ( Add_i8 | Add_i16 | Add_i32 | Add_i64 | Add_f64 | Min_scalar_f64
| Max_scalar_f64 | Round_current_f64_i64 | Sqrt_scalar_f64
| Sqrt_scalar_f32 | Sqrt_f64 | Add_saturating_unsigned_i8
| Add_saturating_unsigned_i16 | Add_saturating_i8 | Add_saturating_i16
| Sub_i8 | Sub_i16 | Sub_i32 | Sub_i64 | Sub_f64
| Sub_saturating_unsigned_i8 | Sub_saturating_unsigned_i16
| Sub_saturating_i8 | Sub_saturating_i16 | Max_unsigned_i8 | Max_i16
| Max_f64 | Min_unsigned_i8 | Min_i16 | Min_f64 | Mul_f64 | Div_f64
| And_bits | Andnot_bits | Or_bits | Xor_bits | Movemask_8 | Movemask_64
| Cmpeq_i8 | Cmpeq_i16 | Cmpeq_i32 | Cmpgt_i8 | Cmpgt_i16 | Cmpgt_i32
| I32_to_f64 | I32_to_f32 | F64_to_i32 | F64_to_f32 | F32_to_i32
| F32_to_f64 | SLL_i16 | SLL_i32 | SLL_i64 | SRL_i16 | SRL_i32 | SRL_i64
| SRA_i16 | SRA_i32 | I16_to_i8 | I32_to_i16 | I16_to_unsigned_i8
| I32_to_unsigned_i16 | Avg_unsigned_i8 | Avg_unsigned_i16
| SAD_unsigned_i8 | Interleave_high_8 | Interleave_high_16
| Interleave_high_64 | Interleave_low_8 | Interleave_low_16
| Interleave_low_64 | SLLi_i16 _ | SLLi_i32 _ | SLLi_i64 _ | SRLi_i16 _
| SRLi_i32 _ | SRLi_i64 _ | SRAi_i16 _ | SRAi_i32 _ | Shift_left_bytes _
| Shift_right_bytes _ | Cmp_f64 _ | Shuffle_64 _ | Shuffle_high_16 _
| Shuffle_low_16 _ | Mulhi_i16 | Mulhi_unsigned_i16 | Mullo_i16
| Mul_hadd_i16_to_i32 ),
| Max_scalar_f64 | Round_current_f64_i64 | Sqrt_scalar_f64 | Sqrt_f64
| Add_saturating_unsigned_i8 | Add_saturating_unsigned_i16
| Add_saturating_i8 | Add_saturating_i16 | Sub_i8 | Sub_i16 | Sub_i32
| Sub_i64 | Sub_f64 | Sub_saturating_unsigned_i8
| Sub_saturating_unsigned_i16 | Sub_saturating_i8 | Sub_saturating_i16
| Max_unsigned_i8 | Max_i16 | Max_f64 | Min_unsigned_i8 | Min_i16
| Min_f64 | Mul_f64 | Div_f64 | And_bits | Andnot_bits | Or_bits
| Xor_bits | Movemask_8 | Movemask_64 | Cmpeq_i8 | Cmpeq_i16 | Cmpeq_i32
| Cmpgt_i8 | Cmpgt_i16 | Cmpgt_i32 | I32_to_f64 | I32_to_f32 | F64_to_i32
| F64_to_f32 | F32_to_i32 | F32_to_f64 | SLL_i16 | SLL_i32 | SLL_i64
| SRL_i16 | SRL_i32 | SRL_i64 | SRA_i16 | SRA_i32 | I16_to_i8 | I32_to_i16
| I16_to_unsigned_i8 | I32_to_unsigned_i16 | Avg_unsigned_i8
| Avg_unsigned_i16 | SAD_unsigned_i8 | Interleave_high_8
| Interleave_high_16 | Interleave_high_64 | Interleave_low_8
| Interleave_low_16 | Interleave_low_64 | SLLi_i16 _ | SLLi_i32 _
| SLLi_i64 _ | SRLi_i16 _ | SRLi_i32 _ | SRLi_i64 _ | SRAi_i16 _
| SRAi_i32 _ | Shift_left_bytes _ | Shift_right_bytes _ | Cmp_f64 _
| Shuffle_64 _ | Shuffle_high_16 _ | Shuffle_low_16 _ | Mulhi_i16
| Mulhi_unsigned_i16 | Mullo_i16 | Mul_hadd_i16_to_i32 ),
_ ) ->
false

Expand Down Expand Up @@ -521,6 +528,7 @@ let equal_operation_sse41 l r =
when Int.equal l r ->
true
| Round_scalar_f64 l, Round_scalar_f64 r
| Round_scalar_f32 l, Round_scalar_f32 r
| Round_f64 l, Round_f64 r
| Round_f32 l, Round_f32 r
when float_rounding_equal l r ->
Expand All @@ -533,7 +541,7 @@ let equal_operation_sse41 l r =
| Blend_16 _ | Blend_32 _ | Blend_64 _ | Dp_f32 _ | Dp_f64 _ | Mullo_i32
| Extract_i8 _ | Extract_i16 _ | Extract_i32 _ | Extract_i64 _
| Insert_i8 _ | Insert_i16 _ | Insert_i32 _ | Insert_i64 _ | Round_f64 _
| Round_scalar_f64 _ | Round_f32 _ ),
| Round_scalar_f64 _ | Round_scalar_f32 _ | Round_f32 _ ),
_ ) ->
false

Expand Down Expand Up @@ -607,6 +615,13 @@ let print_operation_bmi2 printreg op ppf arg =

let print_operation_sse printreg op ppf arg =
match op with
| Round_current_f32_i64 ->
fprintf ppf "round_current_f32_i64 %a" printreg arg.(0)
| Sqrt_scalar_f32 -> fprintf ppf "sqrt_scalar_f32 %a" printreg arg.(0)
| Min_scalar_f32 ->
fprintf ppf "min_scalar_f32 %a %a" printreg arg.(0) printreg arg.(1)
| Max_scalar_f32 ->
fprintf ppf "max_scalar_f32 %a %a" printreg arg.(0) printreg arg.(1)
| Cmp_f32 i ->
fprintf ppf "cmp_f32[%a] %a %a" print_float_condition i printreg arg.(0)
printreg arg.(1)
Expand Down Expand Up @@ -636,7 +651,6 @@ let print_operation_sse printreg op ppf arg =
let print_operation_sse2 printreg op ppf arg =
match op with
| Sqrt_scalar_f64 -> fprintf ppf "sqrt_scalar_f64 %a" printreg arg.(0)
| Sqrt_scalar_f32 -> fprintf ppf "sqrt_scalar_f32 %a" printreg arg.(0)
| Min_scalar_f64 ->
fprintf ppf "min_scalar_f64 %a %a" printreg arg.(0) printreg arg.(1)
| Max_scalar_f64 ->
Expand Down Expand Up @@ -853,6 +867,9 @@ let print_operation_sse41 printreg op ppf arg =
| Round_scalar_f64 i ->
fprintf ppf "round_scalar_f64[%a] %a" print_float_rounding i printreg
arg.(0)
| Round_scalar_f32 i ->
fprintf ppf "round_scalar_f32[%a] %a" print_float_rounding i printreg
arg.(0)
| Round_f64 i ->
fprintf ppf "round_f64[%a] %a" print_float_rounding i printreg arg.(0)
| Round_f32 i ->
Expand Down Expand Up @@ -912,18 +929,20 @@ let class_of_operation_clmul = function Clmul_64 _ -> Pure
let class_of_operation_bmi2 = function Deposit_64 | Extract_64 -> Pure

let class_of_operation_sse = function
| Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32
| Rcp_f32 | Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64
| Interleave_high_32 | Interleave_low_32 | Interleave_low_32_regs
| Movemask_32 | Shuffle_32 _ ->
| Round_current_f32_i64
(* CR-someday mslater: (SIMD) reads current rounding mode *)
| Sqrt_scalar_f32 | Min_scalar_f32 | Max_scalar_f32 | Cmp_f32 _ | Add_f32
| Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32 | Rcp_f32 | Sqrt_f32
| Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64 | Interleave_high_32
| Interleave_low_32 | Interleave_low_32_regs | Movemask_32 | Shuffle_32 _ ->
Pure

let class_of_operation_sse2 = function
| Round_current_f64_i64
(* CR-someday mslater: (SIMD) reads current rounding mode *)
| Add_i8 | Add_i16 | Add_i32 | Add_i64 | Add_f64 | Add_saturating_i8
| Min_scalar_f64 | Max_scalar_f64 | Sqrt_scalar_f64 | Sqrt_scalar_f32
| Sqrt_f64 | Add_saturating_i16 | Add_saturating_unsigned_i8
| Min_scalar_f64 | Max_scalar_f64 | Sqrt_scalar_f64 | Sqrt_f64
| Add_saturating_i16 | Add_saturating_unsigned_i8
| Add_saturating_unsigned_i16 | Sub_i8 | Sub_i16 | Sub_i32 | Sub_i64 | Sub_f64
| Sub_saturating_i8 | Sub_saturating_i16 | Sub_saturating_unsigned_i8
| Sub_saturating_unsigned_i16 | Max_unsigned_i8 | Max_i16 | Max_f64
Expand Down Expand Up @@ -962,8 +981,8 @@ let class_of_operation_sse41 = function
| Extract_i32 _ | Extract_i64 _ | Insert_i8 _ | Insert_i16 _ | Insert_i32 _
| Insert_i64 _ | Max_i8 | Max_i32 | Max_unsigned_i16 | Max_unsigned_i32
| Min_i8 | Min_i32 | Min_unsigned_i16 | Min_unsigned_i32 | Round_f64 _
| Round_scalar_f64 _ | Round_f32 _ | Multi_sad_unsigned_i8 _
| Minpos_unsigned_i16 | Mullo_i32 ->
| Round_scalar_f64 _ | Round_scalar_f32 _ | Round_f32 _
| Multi_sad_unsigned_i8 _ | Minpos_unsigned_i16 | Mullo_i32 ->
Pure

let class_of_operation_sse42 = function
Expand Down
13 changes: 8 additions & 5 deletions backend/amd64/simd_proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@ let register_behavior_clmul = function Clmul_64 _ -> R_RM_to_fst
let register_behavior_bmi2 = function Extract_64 | Deposit_64 -> R_RM_to_R

let register_behavior_sse = function
| Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32
| Interleave_low_32 | Interleave_high_32 | Shuffle_32 _ ->
| Min_scalar_f32 | Max_scalar_f32 | Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32
| Div_f32 | Max_f32 | Min_f32 | Interleave_low_32 | Interleave_high_32
| Shuffle_32 _ ->
R_RM_to_fst
| Rcp_f32 | Sqrt_f32 | Rsqrt_f32 -> RM_to_R
| Round_current_f32_i64 | Sqrt_scalar_f32 | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 ->
RM_to_R
| Interleave_low_32_regs | High_64_to_low_64 | Low_64_to_high_64 -> R_R_to_fst
| Movemask_32 -> R_to_R

Expand All @@ -64,7 +66,7 @@ let register_behavior_sse2 = function
R_RM_to_fst
| Shuffle_high_16 _ | Shuffle_low_16 _ | I32_to_f64 | I32_to_f32 | F64_to_i32
| Round_current_f64_i64 | F64_to_f32 | F32_to_i32 | F32_to_f64 | Sqrt_f64
| Sqrt_scalar_f64 | Sqrt_scalar_f32 ->
| Sqrt_scalar_f64 ->
RM_to_R
| SLLi_i16 _ | SLLi_i32 _ | SLLi_i64 _ | SRLi_i16 _ | SRLi_i32 _ | SRLi_i64 _
| SRAi_i16 _ | SRAi_i32 _ | Shift_left_bytes _ | Shift_right_bytes _ ->
Expand All @@ -91,7 +93,8 @@ let register_behavior_sse41 = function
R_RM_to_fst
| I8_sx_i16 | I8_sx_i32 | I8_sx_i64 | I16_sx_i32 | I16_sx_i64 | I32_sx_i64
| I8_zx_i16 | I8_zx_i32 | I8_zx_i64 | I16_zx_i32 | I16_zx_i64 | I32_zx_i64
| Round_f64 _ | Round_f32 _ | Minpos_unsigned_i16 | Round_scalar_f64 _ ->
| Round_f64 _ | Round_f32 _ | Minpos_unsigned_i16 | Round_scalar_f64 _
| Round_scalar_f32 _ ->
RM_to_R
| Blendv_8 | Blendv_32 | Blendv_64 -> R_RM_xmm0_to_fst
| Extract_i64 _ | Extract_i32 _ -> R_to_RM
Expand Down
15 changes: 14 additions & 1 deletion backend/amd64/simd_selection.ml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ let select_operation_bmi2 op args =

let select_operation_sse op args =
match op with
| "caml_sse_float32_sqrt" | "sqrtf" -> Some (Sqrt_scalar_f32, args)
| "caml_sse_float32_max" -> Some (Max_scalar_f32, args)
| "caml_sse_float32_min" -> Some (Min_scalar_f32, args)
| "caml_sse_cast_float32_int64" -> Some (Round_current_f32_i64, args)
| "caml_sse_float32x4_cmp" ->
let i, args = extract_constant args ~max:7 op in
Some (Cmp_f32 (float_condition_of_int i), args)
Expand All @@ -103,7 +107,6 @@ let select_operation_sse op args =
let select_operation_sse2 op args =
match op with
| "caml_sse2_float64_sqrt" | "sqrt" -> Some (Sqrt_scalar_f64, args)
| "caml_sse2_float32_sqrt" | "sqrtf" -> Some (Sqrt_scalar_f32, args)
| "caml_sse2_float64_max" -> Some (Max_scalar_f64, args)
| "caml_sse2_float64_min" -> Some (Min_scalar_f64, args)
| "caml_sse2_cast_float64_int64" -> Some (Round_current_f64_i64, args)
Expand Down Expand Up @@ -335,8 +338,18 @@ let select_operation_sse41 op args =
let i, args = extract_constant args ~max:15 op in
Some (Round_f64 (float_rounding_of_int i), args)
| "caml_sse41_float64_round" ->
(* CR-someday mslater: the following CR also applies here, but this
builtin is not exposed by any of the stdlib libraries. *)
let i, args = extract_constant args ~max:15 op in
Some (Round_scalar_f64 (float_rounding_of_int i), args)
| "caml_sse41_float32_round" ->
(* CR-someday mslater: this builtin is exposed by float32.ml, so must
actually be cross-platform. Currently, non-amd64 architectures will
fall back to a C implementation. If we want the arm64 backend to
specialize it, we should redefine the constant mapping from the amd64
values to a new sum type. *)
let i, args = extract_constant args ~max:15 op in
Some (Round_scalar_f32 (float_rounding_of_int i), args)
| "caml_sse41_int8x16_max" -> Some (Max_i8, args)
| "caml_sse41_int32x4_max" -> Some (Max_i32, args)
| "caml_sse41_int16x8_max_unsigned" -> Some (Max_unsigned_i16, args)
Expand Down
3 changes: 3 additions & 0 deletions backend/x86_ast.mli
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ type arg =
| Mem64_RIP of data_type * string * int

type sse_instruction =
| MINSS of arg * arg
| MAXSS of arg * arg
| CMPPS of float_condition * arg * arg
| SHUFPS of arg * arg * arg
| ADDPS of arg * arg
Expand Down Expand Up @@ -280,6 +282,7 @@ type sse41_instruction =
| PMINUD of arg * arg
| ROUNDPD of rounding * arg * arg
| ROUNDPS of rounding * arg * arg
| ROUNDSS of rounding * arg * arg
| MPSADBW of arg * arg * arg
| PHMINPOSUW of arg * arg
| PMULLD of arg * arg
Expand Down
18 changes: 18 additions & 0 deletions backend/x86_binary_emitter.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1091,6 +1091,7 @@ let emit_dpps = suffix emit_osize_rf_rfm_3A 0x40
let emit_dppd = suffix emit_osize_rf_rfm_3A 0x41
let emit_roundps = suffix emit_osize_rf_rfm_3A 0x08
let emit_roundpd = suffix emit_osize_rf_rfm_3A 0x09
let emit_roundss = suffix emit_osize_rf_rfm_3A 0x0A

let emit_pmulhw = emit_osize_rf_rfm 0xE5
let emit_pmulhuw = emit_osize_rf_rfm 0xE4
Expand Down Expand Up @@ -1754,6 +1755,20 @@ let emit_mfence b = buf_opcodes b [ 0x0F; 0xAE; 0xF0 ]

let emit_leave b = buf_int8 b 0xC9

(* Binary encoding for [maxss]. Writes the 0xF3 prefix byte into [b], then
   hands the opcode bytes [0x0F; 0x5F], the [src] operand and the register
   number of [dst] to [emit_mod_rm_reg].

   [dst] must be a [Regf] and [src] one of [Regf], [Mem] or [Mem64_RIP];
   every other operand combination is an internal error ([assert false]). *)
let emit_maxss b ~dst ~src =
  match src with
  | Regf _ | Mem _ | Mem64_RIP _ -> (
    match dst with
    | Regf dst_reg ->
      (* The prefix byte is written before the call that emits the opcode. *)
      buf_int8 b 0xF3;
      emit_mod_rm_reg b no_rex [ 0x0F; 0x5F ] src (rd_of_regf dst_reg)
    | _ -> assert false)
  | _ -> assert false

(* Binary encoding for [minss]. Writes the 0xF3 prefix byte into [b], then
   hands the opcode bytes [0x0F; 0x5D], the [src] operand and the register
   number of [dst] to [emit_mod_rm_reg].

   [dst] must be a [Regf] and [src] one of [Regf], [Mem] or [Mem64_RIP];
   every other operand combination is an internal error ([assert false]). *)
let emit_minss b ~dst ~src =
  match src with
  | Regf _ | Mem _ | Mem64_RIP _ -> (
    match dst with
    | Regf dst_reg ->
      (* The prefix byte is written before the call that emits the opcode. *)
      buf_int8 b 0xF3;
      emit_mod_rm_reg b no_rex [ 0x0F; 0x5D ] src (rd_of_regf dst_reg)
    | _ -> assert false)
  | _ -> assert false

let emit_maxsd b ~dst ~src =
match (dst, src) with
| (Regf reg, ((Regf _ | Mem _ | Mem64_RIP _) as rm)) ->
Expand Down Expand Up @@ -1933,6 +1948,8 @@ let assemble_instr b loc = function
| XORPS (src, dst) -> emit_xor_float ~width:Cmm.Float32 b dst src
| ANDPS (src, dst) -> emit_and_float ~width:Cmm.Float32 b dst src
| CMPSS (condition, src, dst) -> emit_cmp_float ~width:Cmm.Float32 b ~condition ~dst ~src
| SSE MINSS (src, dst) -> emit_minss b ~dst ~src
| SSE MAXSS (src, dst) -> emit_maxss b ~dst ~src
| SSE CMPPS (cmp, src, dst) -> emit_cmpps b (imm8_of_float_condition cmp) dst src
| SSE ADDPS (src, dst) -> emit_addps b dst src
| SSE SUBPS (src, dst) -> emit_subps b dst src
Expand Down Expand Up @@ -2095,6 +2112,7 @@ let assemble_instr b loc = function
| SSE41 PMINUD (src, dst) -> emit_pminud b dst src
| SSE41 ROUNDPD (n, src, dst) -> emit_roundpd b (imm8_of_rounding n) dst src
| SSE41 ROUNDPS (n, src, dst) -> emit_roundps b (imm8_of_rounding n) dst src
| SSE41 ROUNDSS (n, src, dst) -> emit_roundss b (imm8_of_rounding n) dst src
| SSE41 PHMINPOSUW (src, dst) -> emit_phminposuw b dst src
| SSE41 PMULLD (src, dst) -> emit_pmulld b dst src
| SSE41 MPSADBW (n, src, dst) -> emit_mpsadbw b (imm n) dst src
Expand Down
3 changes: 3 additions & 0 deletions backend/x86_dsl.ml
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ module I = struct
let andps x y = emit (ANDPS (x, y))
let cmpss i x y = emit (CMPSS (i, x, y))

let minss x y = emit (SSE (MINSS (x, y)))
let maxss x y = emit (SSE (MAXSS (x, y)))
let cmpps i x y = emit (SSE (CMPPS (i, x, y)))
let shufps i x y = emit (SSE (SHUFPS (i, x, y)))
let addps x y = emit (SSE (ADDPS (x, y)))
Expand Down Expand Up @@ -372,6 +374,7 @@ module I = struct
let pminud x y = emit (SSE41 (PMINUD (x, y)))
let roundpd i x y = emit (SSE41 (ROUNDPD (i, x, y)))
let roundps i x y = emit (SSE41 (ROUNDPS (i, x, y)))
let roundss i x y = emit (SSE41 (ROUNDSS (i, x, y)))
let mpsadbw i x y = emit (SSE41 (MPSADBW (i, x, y)))
let phminposuw x y = emit (SSE41 (PHMINPOSUW (x, y)))
let pmulld x y = emit (SSE41 (PMULLD (x, y)))
Expand Down
Loading

0 comments on commit 12ba4d9

Please sign in to comment.