Skip to content

Float32 min/max/rounding intrinsics #2684

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions backend/amd64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,13 @@ let emit_simd_instr op i =
| CLMUL (Clmul_64 n) -> I.pclmulqdq (X86_dsl.int n) (arg i 1) (res i 0)
| BMI2 Extract_64 -> I.pext (arg i 1) (arg i 0) (res i 0)
| BMI2 Deposit_64 -> I.pdep (arg i 1) (arg i 0) (res i 0)
| SSE Round_current_f32_i64 -> I.cvtss2si (arg i 0) (res i 0)
| SSE Sqrt_scalar_f32 ->
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.sqrtss (arg i 0) (res i 0)
| SSE Max_scalar_f32 -> I.maxss (arg i 1) (res i 0)
| SSE Min_scalar_f32 -> I.minss (arg i 1) (res i 0)
| SSE (Cmp_f32 n) -> I.cmpps n (arg i 1) (res i 0)
| SSE Add_f32 -> I.addps (arg i 1) (res i 0)
| SSE Sub_f32 -> I.subps (arg i 1) (res i 0)
Expand All @@ -1109,10 +1116,6 @@ let emit_simd_instr op i =
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.sqrtsd (arg i 0) (res i 0)
| SSE2 Sqrt_scalar_f32 ->
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.sqrtss (arg i 0) (res i 0)
| SSE2 Sqrt_f64 -> I.sqrtpd (arg i 0) (res i 0)
| SSE2 Add_i8 -> I.paddb (arg i 1) (res i 0)
| SSE2 Add_i16 -> I.paddw (arg i 1) (res i 0)
Expand Down Expand Up @@ -1271,6 +1274,10 @@ let emit_simd_instr op i =
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.roundsd n (arg i 0) (res i 0)
| SSE41 (Round_scalar_f32 n) ->
if arg i 0 <> res i 0 then
I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
I.roundss n (arg i 0) (res i 0)
| SSE41 (Round_f64 n) -> I.roundpd n (arg i 0) (res i 0)
| SSE41 (Round_f32 n) -> I.roundps n (arg i 0) (res i 0)
| SSE41 (Multi_sad_unsigned_i8 n) -> I.mpsadbw (X86_dsl.int n) (arg i 1) (res i 0)
Expand Down
91 changes: 55 additions & 36 deletions backend/amd64/simd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ type bmi2_operation =
| Extract_64

type sse_operation =
| Round_current_f32_i64
| Sqrt_scalar_f32
| Min_scalar_f32
| Max_scalar_f32
| Cmp_f32 of float_condition
| Add_f32
| Sub_f32
Expand All @@ -89,7 +93,6 @@ type sse_operation =
type sse2_operation =
| Round_current_f64_i64
| Sqrt_scalar_f64
| Sqrt_scalar_f32
| Min_scalar_f64
| Max_scalar_f64
| Sqrt_f64
Expand Down Expand Up @@ -207,6 +210,7 @@ type ssse3_operation =

type sse41_operation =
| Round_scalar_f64 of float_rounding
| Round_scalar_f32 of float_rounding
| Blend_16 of int
| Blend_32 of int
| Blend_64 of int
Expand Down Expand Up @@ -289,6 +293,10 @@ let equal_operation_bmi2 l r =

let equal_operation_sse l r =
match l, r with
| Round_current_f32_i64, Round_current_f32_i64
| Sqrt_scalar_f32, Sqrt_scalar_f32
| Min_scalar_f32, Min_scalar_f32
| Max_scalar_f32, Max_scalar_f32
| Add_f32, Add_f32
| Sub_f32, Sub_f32
| Mul_f32, Mul_f32
Expand All @@ -307,10 +315,11 @@ let equal_operation_sse l r =
true
| Cmp_f32 l, Cmp_f32 r when float_condition_equal l r -> true
| Shuffle_32 l, Shuffle_32 r when Int.equal l r -> true
| ( ( Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32 | Rcp_f32
| Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64
| Interleave_high_32 | Interleave_low_32_regs | Interleave_low_32
| Movemask_32 | Cmp_f32 _ | Shuffle_32 _ ),
| ( ( Round_current_f32_i64 | Sqrt_scalar_f32 | Min_scalar_f32
| Max_scalar_f32 | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32
| Min_f32 | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64
| Low_64_to_high_64 | Interleave_high_32 | Interleave_low_32_regs
| Interleave_low_32 | Movemask_32 | Cmp_f32 _ | Shuffle_32 _ ),
_ ) ->
false

Expand All @@ -320,7 +329,6 @@ let equal_operation_sse2 l r =
| Min_scalar_f64, Min_scalar_f64
| Max_scalar_f64, Max_scalar_f64
| Sqrt_scalar_f64, Sqrt_scalar_f64
| Sqrt_scalar_f32, Sqrt_scalar_f32
| Sqrt_f64, Sqrt_f64
| Add_i8, Add_i8
| Add_i16, Add_i16
Expand Down Expand Up @@ -409,26 +417,25 @@ let equal_operation_sse2 l r =
true
| Cmp_f64 l, Cmp_f64 r when float_condition_equal l r -> true
| ( ( Add_i8 | Add_i16 | Add_i32 | Add_i64 | Add_f64 | Min_scalar_f64
| Max_scalar_f64 | Round_current_f64_i64 | Sqrt_scalar_f64
| Sqrt_scalar_f32 | Sqrt_f64 | Add_saturating_unsigned_i8
| Add_saturating_unsigned_i16 | Add_saturating_i8 | Add_saturating_i16
| Sub_i8 | Sub_i16 | Sub_i32 | Sub_i64 | Sub_f64
| Sub_saturating_unsigned_i8 | Sub_saturating_unsigned_i16
| Sub_saturating_i8 | Sub_saturating_i16 | Max_unsigned_i8 | Max_i16
| Max_f64 | Min_unsigned_i8 | Min_i16 | Min_f64 | Mul_f64 | Div_f64
| And_bits | Andnot_bits | Or_bits | Xor_bits | Movemask_8 | Movemask_64
| Cmpeq_i8 | Cmpeq_i16 | Cmpeq_i32 | Cmpgt_i8 | Cmpgt_i16 | Cmpgt_i32
| I32_to_f64 | I32_to_f32 | F64_to_i32 | F64_to_f32 | F32_to_i32
| F32_to_f64 | SLL_i16 | SLL_i32 | SLL_i64 | SRL_i16 | SRL_i32 | SRL_i64
| SRA_i16 | SRA_i32 | I16_to_i8 | I32_to_i16 | I16_to_unsigned_i8
| I32_to_unsigned_i16 | Avg_unsigned_i8 | Avg_unsigned_i16
| SAD_unsigned_i8 | Interleave_high_8 | Interleave_high_16
| Interleave_high_64 | Interleave_low_8 | Interleave_low_16
| Interleave_low_64 | SLLi_i16 _ | SLLi_i32 _ | SLLi_i64 _ | SRLi_i16 _
| SRLi_i32 _ | SRLi_i64 _ | SRAi_i16 _ | SRAi_i32 _ | Shift_left_bytes _
| Shift_right_bytes _ | Cmp_f64 _ | Shuffle_64 _ | Shuffle_high_16 _
| Shuffle_low_16 _ | Mulhi_i16 | Mulhi_unsigned_i16 | Mullo_i16
| Mul_hadd_i16_to_i32 ),
| Max_scalar_f64 | Round_current_f64_i64 | Sqrt_scalar_f64 | Sqrt_f64
| Add_saturating_unsigned_i8 | Add_saturating_unsigned_i16
| Add_saturating_i8 | Add_saturating_i16 | Sub_i8 | Sub_i16 | Sub_i32
| Sub_i64 | Sub_f64 | Sub_saturating_unsigned_i8
| Sub_saturating_unsigned_i16 | Sub_saturating_i8 | Sub_saturating_i16
| Max_unsigned_i8 | Max_i16 | Max_f64 | Min_unsigned_i8 | Min_i16
| Min_f64 | Mul_f64 | Div_f64 | And_bits | Andnot_bits | Or_bits
| Xor_bits | Movemask_8 | Movemask_64 | Cmpeq_i8 | Cmpeq_i16 | Cmpeq_i32
| Cmpgt_i8 | Cmpgt_i16 | Cmpgt_i32 | I32_to_f64 | I32_to_f32 | F64_to_i32
| F64_to_f32 | F32_to_i32 | F32_to_f64 | SLL_i16 | SLL_i32 | SLL_i64
| SRL_i16 | SRL_i32 | SRL_i64 | SRA_i16 | SRA_i32 | I16_to_i8 | I32_to_i16
| I16_to_unsigned_i8 | I32_to_unsigned_i16 | Avg_unsigned_i8
| Avg_unsigned_i16 | SAD_unsigned_i8 | Interleave_high_8
| Interleave_high_16 | Interleave_high_64 | Interleave_low_8
| Interleave_low_16 | Interleave_low_64 | SLLi_i16 _ | SLLi_i32 _
| SLLi_i64 _ | SRLi_i16 _ | SRLi_i32 _ | SRLi_i64 _ | SRAi_i16 _
| SRAi_i32 _ | Shift_left_bytes _ | Shift_right_bytes _ | Cmp_f64 _
| Shuffle_64 _ | Shuffle_high_16 _ | Shuffle_low_16 _ | Mulhi_i16
| Mulhi_unsigned_i16 | Mullo_i16 | Mul_hadd_i16_to_i32 ),
_ ) ->
false

Expand Down Expand Up @@ -521,6 +528,7 @@ let equal_operation_sse41 l r =
when Int.equal l r ->
true
| Round_scalar_f64 l, Round_scalar_f64 r
| Round_scalar_f32 l, Round_scalar_f32 r
| Round_f64 l, Round_f64 r
| Round_f32 l, Round_f32 r
when float_rounding_equal l r ->
Expand All @@ -533,7 +541,7 @@ let equal_operation_sse41 l r =
| Blend_16 _ | Blend_32 _ | Blend_64 _ | Dp_f32 _ | Dp_f64 _ | Mullo_i32
| Extract_i8 _ | Extract_i16 _ | Extract_i32 _ | Extract_i64 _
| Insert_i8 _ | Insert_i16 _ | Insert_i32 _ | Insert_i64 _ | Round_f64 _
| Round_scalar_f64 _ | Round_f32 _ ),
| Round_scalar_f64 _ | Round_scalar_f32 _ | Round_f32 _ ),
_ ) ->
false

Expand Down Expand Up @@ -607,6 +615,13 @@ let print_operation_bmi2 printreg op ppf arg =

let print_operation_sse printreg op ppf arg =
match op with
| Round_current_f32_i64 ->
fprintf ppf "round_current_f32_i64 %a" printreg arg.(0)
| Sqrt_scalar_f32 -> fprintf ppf "sqrt_scalar_f32 %a" printreg arg.(0)
| Min_scalar_f32 ->
fprintf ppf "min_scalar_f32 %a %a" printreg arg.(0) printreg arg.(1)
| Max_scalar_f32 ->
fprintf ppf "max_scalar_f32 %a %a" printreg arg.(0) printreg arg.(1)
| Cmp_f32 i ->
fprintf ppf "cmp_f32[%a] %a %a" print_float_condition i printreg arg.(0)
printreg arg.(1)
Expand Down Expand Up @@ -636,7 +651,6 @@ let print_operation_sse printreg op ppf arg =
let print_operation_sse2 printreg op ppf arg =
match op with
| Sqrt_scalar_f64 -> fprintf ppf "sqrt_scalar_f64 %a" printreg arg.(0)
| Sqrt_scalar_f32 -> fprintf ppf "sqrt_scalar_f32 %a" printreg arg.(0)
| Min_scalar_f64 ->
fprintf ppf "min_scalar_f64 %a %a" printreg arg.(0) printreg arg.(1)
| Max_scalar_f64 ->
Expand Down Expand Up @@ -853,6 +867,9 @@ let print_operation_sse41 printreg op ppf arg =
| Round_scalar_f64 i ->
fprintf ppf "round_scalar_f64[%a] %a" print_float_rounding i printreg
arg.(0)
| Round_scalar_f32 i ->
fprintf ppf "round_scalar_f32[%a] %a" print_float_rounding i printreg
arg.(0)
| Round_f64 i ->
fprintf ppf "round_f64[%a] %a" print_float_rounding i printreg arg.(0)
| Round_f32 i ->
Expand Down Expand Up @@ -912,18 +929,20 @@ let class_of_operation_clmul = function Clmul_64 _ -> Pure
let class_of_operation_bmi2 = function Deposit_64 | Extract_64 -> Pure

let class_of_operation_sse = function
| Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32
| Rcp_f32 | Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64
| Interleave_high_32 | Interleave_low_32 | Interleave_low_32_regs
| Movemask_32 | Shuffle_32 _ ->
| Round_current_f32_i64
(* CR-someday mslater: (SIMD) reads current rounding mode *)
| Sqrt_scalar_f32 | Min_scalar_f32 | Max_scalar_f32 | Cmp_f32 _ | Add_f32
| Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32 | Rcp_f32 | Sqrt_f32
| Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64 | Interleave_high_32
| Interleave_low_32 | Interleave_low_32_regs | Movemask_32 | Shuffle_32 _ ->
Pure

let class_of_operation_sse2 = function
| Round_current_f64_i64
(* CR-someday mslater: (SIMD) reads current rounding mode *)
| Add_i8 | Add_i16 | Add_i32 | Add_i64 | Add_f64 | Add_saturating_i8
| Min_scalar_f64 | Max_scalar_f64 | Sqrt_scalar_f64 | Sqrt_scalar_f32
| Sqrt_f64 | Add_saturating_i16 | Add_saturating_unsigned_i8
| Min_scalar_f64 | Max_scalar_f64 | Sqrt_scalar_f64 | Sqrt_f64
| Add_saturating_i16 | Add_saturating_unsigned_i8
| Add_saturating_unsigned_i16 | Sub_i8 | Sub_i16 | Sub_i32 | Sub_i64 | Sub_f64
| Sub_saturating_i8 | Sub_saturating_i16 | Sub_saturating_unsigned_i8
| Sub_saturating_unsigned_i16 | Max_unsigned_i8 | Max_i16 | Max_f64
Expand Down Expand Up @@ -962,8 +981,8 @@ let class_of_operation_sse41 = function
| Extract_i32 _ | Extract_i64 _ | Insert_i8 _ | Insert_i16 _ | Insert_i32 _
| Insert_i64 _ | Max_i8 | Max_i32 | Max_unsigned_i16 | Max_unsigned_i32
| Min_i8 | Min_i32 | Min_unsigned_i16 | Min_unsigned_i32 | Round_f64 _
| Round_scalar_f64 _ | Round_f32 _ | Multi_sad_unsigned_i8 _
| Minpos_unsigned_i16 | Mullo_i32 ->
| Round_scalar_f64 _ | Round_scalar_f32 _ | Round_f32 _
| Multi_sad_unsigned_i8 _ | Minpos_unsigned_i16 | Mullo_i32 ->
Pure

let class_of_operation_sse42 = function
Expand Down
13 changes: 8 additions & 5 deletions backend/amd64/simd_proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@ let register_behavior_clmul = function Clmul_64 _ -> R_RM_to_fst
let register_behavior_bmi2 = function Extract_64 | Deposit_64 -> R_RM_to_R

let register_behavior_sse = function
| Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32
| Interleave_low_32 | Interleave_high_32 | Shuffle_32 _ ->
| Min_scalar_f32 | Max_scalar_f32 | Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32
| Div_f32 | Max_f32 | Min_f32 | Interleave_low_32 | Interleave_high_32
| Shuffle_32 _ ->
R_RM_to_fst
| Rcp_f32 | Sqrt_f32 | Rsqrt_f32 -> RM_to_R
| Round_current_f32_i64 | Sqrt_scalar_f32 | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 ->
RM_to_R
| Interleave_low_32_regs | High_64_to_low_64 | Low_64_to_high_64 -> R_R_to_fst
| Movemask_32 -> R_to_R

Expand All @@ -64,7 +66,7 @@ let register_behavior_sse2 = function
R_RM_to_fst
| Shuffle_high_16 _ | Shuffle_low_16 _ | I32_to_f64 | I32_to_f32 | F64_to_i32
| Round_current_f64_i64 | F64_to_f32 | F32_to_i32 | F32_to_f64 | Sqrt_f64
| Sqrt_scalar_f64 | Sqrt_scalar_f32 ->
| Sqrt_scalar_f64 ->
RM_to_R
| SLLi_i16 _ | SLLi_i32 _ | SLLi_i64 _ | SRLi_i16 _ | SRLi_i32 _ | SRLi_i64 _
| SRAi_i16 _ | SRAi_i32 _ | Shift_left_bytes _ | Shift_right_bytes _ ->
Expand All @@ -91,7 +93,8 @@ let register_behavior_sse41 = function
R_RM_to_fst
| I8_sx_i16 | I8_sx_i32 | I8_sx_i64 | I16_sx_i32 | I16_sx_i64 | I32_sx_i64
| I8_zx_i16 | I8_zx_i32 | I8_zx_i64 | I16_zx_i32 | I16_zx_i64 | I32_zx_i64
| Round_f64 _ | Round_f32 _ | Minpos_unsigned_i16 | Round_scalar_f64 _ ->
| Round_f64 _ | Round_f32 _ | Minpos_unsigned_i16 | Round_scalar_f64 _
| Round_scalar_f32 _ ->
RM_to_R
| Blendv_8 | Blendv_32 | Blendv_64 -> R_RM_xmm0_to_fst
| Extract_i64 _ | Extract_i32 _ -> R_to_RM
Expand Down
15 changes: 14 additions & 1 deletion backend/amd64/simd_selection.ml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ let select_operation_bmi2 op args =

let select_operation_sse op args =
match op with
| "caml_sse_float32_sqrt" | "sqrtf" -> Some (Sqrt_scalar_f32, args)
| "caml_sse_float32_max" -> Some (Max_scalar_f32, args)
| "caml_sse_float32_min" -> Some (Min_scalar_f32, args)
| "caml_sse_cast_float32_int64" -> Some (Round_current_f32_i64, args)
| "caml_sse_float32x4_cmp" ->
let i, args = extract_constant args ~max:7 op in
Some (Cmp_f32 (float_condition_of_int i), args)
Expand All @@ -103,7 +107,6 @@ let select_operation_sse op args =
let select_operation_sse2 op args =
match op with
| "caml_sse2_float64_sqrt" | "sqrt" -> Some (Sqrt_scalar_f64, args)
| "caml_sse2_float32_sqrt" | "sqrtf" -> Some (Sqrt_scalar_f32, args)
| "caml_sse2_float64_max" -> Some (Max_scalar_f64, args)
| "caml_sse2_float64_min" -> Some (Min_scalar_f64, args)
| "caml_sse2_cast_float64_int64" -> Some (Round_current_f64_i64, args)
Expand Down Expand Up @@ -335,8 +338,18 @@ let select_operation_sse41 op args =
let i, args = extract_constant args ~max:15 op in
Some (Round_f64 (float_rounding_of_int i), args)
| "caml_sse41_float64_round" ->
(* CR-someday mslater: the following CR also applies here, but this
builtin is not exposed by any of the stdlib libraries. *)
let i, args = extract_constant args ~max:15 op in
Some (Round_scalar_f64 (float_rounding_of_int i), args)
| "caml_sse41_float32_round" ->
(* CR-someday mslater: this builtin is exposed by float32.ml, so must
actually be cross-platform. Currently, non-amd64 architectures will
fall back to a C implementation. If we want the arm64 backend to
specialize it, we should redefine the constant mapping from the amd64
values to a new sum type. *)
let i, args = extract_constant args ~max:15 op in
Some (Round_scalar_f32 (float_rounding_of_int i), args)
| "caml_sse41_int8x16_max" -> Some (Max_i8, args)
| "caml_sse41_int32x4_max" -> Some (Max_i32, args)
| "caml_sse41_int16x8_max_unsigned" -> Some (Max_unsigned_i16, args)
Expand Down
4 changes: 4 additions & 0 deletions backend/cmm_builtins.ml
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,10 @@ let transl_builtin name args dbg typ_res =
Some (Cop (Creinterpret_cast Float32_of_int32, args, dbg))
| "caml_float32_to_bits" ->
Some (Cop (Creinterpret_cast Int32_of_float32, args, dbg))
| "caml_float32_to_int64" ->
Some (Cop (Cstatic_cast (Int_of_float Float32), args, dbg))
| "caml_float32_of_int64" ->
Some (Cop (Cstatic_cast (Float_of_int Float32), args, dbg))
| "caml_int_clz_tagged_to_untagged" ->
(* The tag does not change the number of leading zeros. The advantage of
keeping the tag is it guarantees that, on x86-64, the input to the BSR
Expand Down
3 changes: 3 additions & 0 deletions backend/x86_ast.mli
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ type arg =
| Mem64_RIP of data_type * string * int

type sse_instruction =
| MINSS of arg * arg
| MAXSS of arg * arg
| CMPPS of float_condition * arg * arg
| SHUFPS of arg * arg * arg
| ADDPS of arg * arg
Expand Down Expand Up @@ -280,6 +282,7 @@ type sse41_instruction =
| PMINUD of arg * arg
| ROUNDPD of rounding * arg * arg
| ROUNDPS of rounding * arg * arg
| ROUNDSS of rounding * arg * arg
| MPSADBW of arg * arg * arg
| PHMINPOSUW of arg * arg
| PMULLD of arg * arg
Expand Down
18 changes: 18 additions & 0 deletions backend/x86_binary_emitter.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1091,6 +1091,7 @@ let emit_dpps = suffix emit_osize_rf_rfm_3A 0x40
let emit_dppd = suffix emit_osize_rf_rfm_3A 0x41
let emit_roundps = suffix emit_osize_rf_rfm_3A 0x08
let emit_roundpd = suffix emit_osize_rf_rfm_3A 0x09
(* ROUNDSS encoder. Built from the same [emit_osize_rf_rfm_3A] helper as
   [emit_roundps] (0x08) and [emit_roundpd] (0x09) just above; 0x0A is
   presumably the final opcode byte of the instruction -- NOTE(review): confirm
   against the Intel SDM entry for ROUNDSS. *)
let emit_roundss = suffix emit_osize_rf_rfm_3A 0x0A

let emit_pmulhw = emit_osize_rf_rfm 0xE5
let emit_pmulhuw = emit_osize_rf_rfm 0xE4
Expand Down Expand Up @@ -1754,6 +1755,20 @@ let emit_mfence b = buf_opcodes b [ 0x0F; 0xAE; 0xF0 ]

let emit_leave b = buf_int8 b 0xC9

(* Encode MAXSS (scalar float32 maximum) into buffer [b].

   The 0xF3 prefix byte is written first, followed by the opcode bytes
   0F 5F together with the ModRM-encoded operands. [dst] has to be a float
   register; [src] may be a float register or one of the two memory operand
   forms. Every other operand combination is an internal error and trips the
   assertion. *)
let emit_maxss b ~dst ~src =
  match dst, src with
  | Regf dst_reg, (Regf _ | Mem _ | Mem64_RIP _) ->
    let opcode = [ 0x0F; 0x5F ] in
    buf_int8 b 0xF3;
    emit_mod_rm_reg b no_rex opcode src (rd_of_regf dst_reg)
  | _, _ -> assert false

(* Encode MINSS (scalar float32 minimum) into buffer [b].

   The 0xF3 prefix byte is written first, followed by the opcode bytes
   0F 5D together with the ModRM-encoded operands. [dst] has to be a float
   register; [src] may be a float register or one of the two memory operand
   forms. Every other operand combination is an internal error and trips the
   assertion. *)
let emit_minss b ~dst ~src =
  match dst, src with
  | Regf dst_reg, (Regf _ | Mem _ | Mem64_RIP _) ->
    let opcode = [ 0x0F; 0x5D ] in
    buf_int8 b 0xF3;
    emit_mod_rm_reg b no_rex opcode src (rd_of_regf dst_reg)
  | _, _ -> assert false

let emit_maxsd b ~dst ~src =
match (dst, src) with
| (Regf reg, ((Regf _ | Mem _ | Mem64_RIP _) as rm)) ->
Expand Down Expand Up @@ -1933,6 +1948,8 @@ let assemble_instr b loc = function
| XORPS (src, dst) -> emit_xor_float ~width:Cmm.Float32 b dst src
| ANDPS (src, dst) -> emit_and_float ~width:Cmm.Float32 b dst src
| CMPSS (condition, src, dst) -> emit_cmp_float ~width:Cmm.Float32 b ~condition ~dst ~src
| SSE MINSS (src, dst) -> emit_minss b ~dst ~src
| SSE MAXSS (src, dst) -> emit_maxss b ~dst ~src
| SSE CMPPS (cmp, src, dst) -> emit_cmpps b (imm8_of_float_condition cmp) dst src
| SSE ADDPS (src, dst) -> emit_addps b dst src
| SSE SUBPS (src, dst) -> emit_subps b dst src
Expand Down Expand Up @@ -2095,6 +2112,7 @@ let assemble_instr b loc = function
| SSE41 PMINUD (src, dst) -> emit_pminud b dst src
| SSE41 ROUNDPD (n, src, dst) -> emit_roundpd b (imm8_of_rounding n) dst src
| SSE41 ROUNDPS (n, src, dst) -> emit_roundps b (imm8_of_rounding n) dst src
| SSE41 ROUNDSS (n, src, dst) -> emit_roundss b (imm8_of_rounding n) dst src
| SSE41 PHMINPOSUW (src, dst) -> emit_phminposuw b dst src
| SSE41 PMULLD (src, dst) -> emit_pmulld b dst src
| SSE41 MPSADBW (n, src, dst) -> emit_mpsadbw b (imm n) dst src
Expand Down
3 changes: 3 additions & 0 deletions backend/x86_dsl.ml
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ module I = struct
let andps x y = emit (ANDPS (x, y))
let cmpss i x y = emit (CMPSS (i, x, y))

let minss x y = emit (SSE (MINSS (x, y)))
let maxss x y = emit (SSE (MAXSS (x, y)))
let cmpps i x y = emit (SSE (CMPPS (i, x, y)))
let shufps i x y = emit (SSE (SHUFPS (i, x, y)))
let addps x y = emit (SSE (ADDPS (x, y)))
Expand Down Expand Up @@ -372,6 +374,7 @@ module I = struct
let pminud x y = emit (SSE41 (PMINUD (x, y)))
let roundpd i x y = emit (SSE41 (ROUNDPD (i, x, y)))
let roundps i x y = emit (SSE41 (ROUNDPS (i, x, y)))
let roundss i x y = emit (SSE41 (ROUNDSS (i, x, y)))
let mpsadbw i x y = emit (SSE41 (MPSADBW (i, x, y)))
let phminposuw x y = emit (SSE41 (PHMINPOSUW (x, y)))
let pmulld x y = emit (SSE41 (PMULLD (x, y)))
Expand Down
Loading
Loading