From 12ba4d92a9901a74e07a6d38d7516298459a17e5 Mon Sep 17 00:00:00 2001
From: Max Slater <max@thenumb.at>
Date: Thu, 13 Jun 2024 12:07:01 -0400
Subject: [PATCH] Float32 min/max/rounding intrinsics (#2684)

---
 backend/amd64/emit.mlp                    | 15 +++-
 backend/amd64/simd.ml                     | 91 ++++++++++++++---------
 backend/amd64/simd_proc.ml                | 13 ++--
 backend/amd64/simd_selection.ml           | 15 +++-
 backend/x86_ast.mli                       |  3 +
 backend/x86_binary_emitter.ml             | 18 +++++
 backend/x86_dsl.ml                        |  3 +
 backend/x86_dsl.mli                       |  3 +
 backend/x86_gas.ml                        |  3 +
 backend/x86_masm.ml                       |  3 +
 ocaml/otherlibs/stdlib_beta/float32.ml    | 28 +++++++
 ocaml/otherlibs/stdlib_beta/float32.mli   | 57 ++++++++++++--
 ocaml/otherlibs/stdlib_beta/float32_u.ml  | 16 ++++
 ocaml/otherlibs/stdlib_beta/float32_u.mli | 40 ++++++++++
 ocaml/runtime/float32.c                   | 45 +++++++++++
 ocaml/runtime4/float32.c                  | 45 +++++++++++
 tests/small_numbers/float32_lib.ml        | 17 +++++
 tests/small_numbers/float32_u_lib.ml      | 18 +++++
 tests/small_numbers/stubs.c               | 24 ++++++
 19 files changed, 404 insertions(+), 53 deletions(-)

diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
index 3cde513ca31..7a5213c0cf8 100644
--- a/backend/amd64/emit.mlp
+++ b/backend/amd64/emit.mlp
@@ -1086,6 +1086,13 @@ let emit_simd_instr op i =
   | CLMUL (Clmul_64 n) -> I.pclmulqdq (X86_dsl.int n) (arg i 1) (res i 0)
   | BMI2 Extract_64 -> I.pext (arg i 1) (arg i 0) (res i 0)
   | BMI2 Deposit_64 -> I.pdep (arg i 1) (arg i 0) (res i 0)
+  | SSE Round_current_f32_i64 -> I.cvtss2si (arg i 0) (res i 0)
+  | SSE Sqrt_scalar_f32 ->
+    if arg i 0 <> res i 0 then
+      I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
+    I.sqrtss (arg i 0) (res i 0)
+  | SSE Max_scalar_f32 -> I.maxss (arg i 1) (res i 0)
+  | SSE Min_scalar_f32 -> I.minss (arg i 1) (res i 0)
   | SSE (Cmp_f32 n) -> I.cmpps n (arg i 1) (res i 0)
   | SSE Add_f32 -> I.addps (arg i 1) (res i 0)
   | SSE Sub_f32 -> I.subps (arg i 1) (res i 0)
@@ -1109,10 +1116,6 @@ let emit_simd_instr op i =
     if arg i 0 <> res i 0 then
       I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
     I.sqrtsd (arg i 0) (res i 0)
-  | SSE2 Sqrt_scalar_f32 ->
-    if arg i 0 <> res i 0 then
-      I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
-    I.sqrtss (arg i 0) (res i 0)
   | SSE2 Sqrt_f64 -> I.sqrtpd (arg i 0) (res i 0)
   | SSE2 Add_i8 -> I.paddb (arg i 1) (res i 0)
   | SSE2 Add_i16 -> I.paddw (arg i 1) (res i 0)
@@ -1271,6 +1274,10 @@ let emit_simd_instr op i =
     if arg i 0 <> res i 0 then
       I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
     I.roundsd n (arg i 0) (res i 0)
+  | SSE41 (Round_scalar_f32 n) ->
+    if arg i 0 <> res i 0 then
+      I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
+    I.roundss n (arg i 0) (res i 0)
   | SSE41 (Round_f64 n) -> I.roundpd n (arg i 0) (res i 0)
   | SSE41 (Round_f32 n) -> I.roundps n (arg i 0) (res i 0)
   | SSE41 (Multi_sad_unsigned_i8 n) -> I.mpsadbw (X86_dsl.int n) (arg i 1) (res i 0)
diff --git a/backend/amd64/simd.ml b/backend/amd64/simd.ml
index d64c2e67576..0828956b1de 100644
--- a/backend/amd64/simd.ml
+++ b/backend/amd64/simd.ml
@@ -68,6 +68,10 @@ type bmi2_operation =
   | Extract_64
 
 type sse_operation =
+  | Round_current_f32_i64
+  | Sqrt_scalar_f32
+  | Min_scalar_f32
+  | Max_scalar_f32
   | Cmp_f32 of float_condition
   | Add_f32
   | Sub_f32
@@ -89,7 +93,6 @@ type sse_operation =
 type sse2_operation =
   | Round_current_f64_i64
   | Sqrt_scalar_f64
-  | Sqrt_scalar_f32
   | Min_scalar_f64
   | Max_scalar_f64
   | Sqrt_f64
@@ -207,6 +210,7 @@ type ssse3_operation =
 
 type sse41_operation =
   | Round_scalar_f64 of float_rounding
+  | Round_scalar_f32 of float_rounding
   | Blend_16 of int
   | Blend_32 of int
   | Blend_64 of int
@@ -289,6 +293,10 @@ let equal_operation_bmi2 l r =
 
 let equal_operation_sse l r =
   match l, r with
+  | Round_current_f32_i64, Round_current_f32_i64
+  | Sqrt_scalar_f32, Sqrt_scalar_f32
+  | Min_scalar_f32, Min_scalar_f32
+  | Max_scalar_f32, Max_scalar_f32
   | Add_f32, Add_f32
   | Sub_f32, Sub_f32
   | Mul_f32, Mul_f32
@@ -307,10 +315,11 @@ let equal_operation_sse l r =
     true
   | Cmp_f32 l, Cmp_f32 r when float_condition_equal l r -> true
   | Shuffle_32 l, Shuffle_32 r when Int.equal l r -> true
-  | ( ( Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32 | Rcp_f32
-      | Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64
-      | Interleave_high_32 | Interleave_low_32_regs | Interleave_low_32
-      | Movemask_32 | Cmp_f32 _ | Shuffle_32 _ ),
+  | ( ( Round_current_f32_i64 | Sqrt_scalar_f32 | Min_scalar_f32
+      | Max_scalar_f32 | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32
+      | Min_f32 | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64
+      | Low_64_to_high_64 | Interleave_high_32 | Interleave_low_32_regs
+      | Interleave_low_32 | Movemask_32 | Cmp_f32 _ | Shuffle_32 _ ),
       _ ) ->
     false
 
@@ -320,7 +329,6 @@ let equal_operation_sse2 l r =
   | Min_scalar_f64, Min_scalar_f64
   | Max_scalar_f64, Max_scalar_f64
   | Sqrt_scalar_f64, Sqrt_scalar_f64
-  | Sqrt_scalar_f32, Sqrt_scalar_f32
   | Sqrt_f64, Sqrt_f64
   | Add_i8, Add_i8
   | Add_i16, Add_i16
@@ -409,26 +417,25 @@ let equal_operation_sse2 l r =
     true
   | Cmp_f64 l, Cmp_f64 r when float_condition_equal l r -> true
   | ( ( Add_i8 | Add_i16 | Add_i32 | Add_i64 | Add_f64 | Min_scalar_f64
-      | Max_scalar_f64 | Round_current_f64_i64 | Sqrt_scalar_f64
-      | Sqrt_scalar_f32 | Sqrt_f64 | Add_saturating_unsigned_i8
-      | Add_saturating_unsigned_i16 | Add_saturating_i8 | Add_saturating_i16
-      | Sub_i8 | Sub_i16 | Sub_i32 | Sub_i64 | Sub_f64
-      | Sub_saturating_unsigned_i8 | Sub_saturating_unsigned_i16
-      | Sub_saturating_i8 | Sub_saturating_i16 | Max_unsigned_i8 | Max_i16
-      | Max_f64 | Min_unsigned_i8 | Min_i16 | Min_f64 | Mul_f64 | Div_f64
-      | And_bits | Andnot_bits | Or_bits | Xor_bits | Movemask_8 | Movemask_64
-      | Cmpeq_i8 | Cmpeq_i16 | Cmpeq_i32 | Cmpgt_i8 | Cmpgt_i16 | Cmpgt_i32
-      | I32_to_f64 | I32_to_f32 | F64_to_i32 | F64_to_f32 | F32_to_i32
-      | F32_to_f64 | SLL_i16 | SLL_i32 | SLL_i64 | SRL_i16 | SRL_i32 | SRL_i64
-      | SRA_i16 | SRA_i32 | I16_to_i8 | I32_to_i16 | I16_to_unsigned_i8
-      | I32_to_unsigned_i16 | Avg_unsigned_i8 | Avg_unsigned_i16
-      | SAD_unsigned_i8 | Interleave_high_8 | Interleave_high_16
-      | Interleave_high_64 | Interleave_low_8 | Interleave_low_16
-      | Interleave_low_64 | SLLi_i16 _ | SLLi_i32 _ | SLLi_i64 _ | SRLi_i16 _
-      | SRLi_i32 _ | SRLi_i64 _ | SRAi_i16 _ | SRAi_i32 _ | Shift_left_bytes _
-      | Shift_right_bytes _ | Cmp_f64 _ | Shuffle_64 _ | Shuffle_high_16 _
-      | Shuffle_low_16 _ | Mulhi_i16 | Mulhi_unsigned_i16 | Mullo_i16
-      | Mul_hadd_i16_to_i32 ),
+      | Max_scalar_f64 | Round_current_f64_i64 | Sqrt_scalar_f64 | Sqrt_f64
+      | Add_saturating_unsigned_i8 | Add_saturating_unsigned_i16
+      | Add_saturating_i8 | Add_saturating_i16 | Sub_i8 | Sub_i16 | Sub_i32
+      | Sub_i64 | Sub_f64 | Sub_saturating_unsigned_i8
+      | Sub_saturating_unsigned_i16 | Sub_saturating_i8 | Sub_saturating_i16
+      | Max_unsigned_i8 | Max_i16 | Max_f64 | Min_unsigned_i8 | Min_i16
+      | Min_f64 | Mul_f64 | Div_f64 | And_bits | Andnot_bits | Or_bits
+      | Xor_bits | Movemask_8 | Movemask_64 | Cmpeq_i8 | Cmpeq_i16 | Cmpeq_i32
+      | Cmpgt_i8 | Cmpgt_i16 | Cmpgt_i32 | I32_to_f64 | I32_to_f32 | F64_to_i32
+      | F64_to_f32 | F32_to_i32 | F32_to_f64 | SLL_i16 | SLL_i32 | SLL_i64
+      | SRL_i16 | SRL_i32 | SRL_i64 | SRA_i16 | SRA_i32 | I16_to_i8 | I32_to_i16
+      | I16_to_unsigned_i8 | I32_to_unsigned_i16 | Avg_unsigned_i8
+      | Avg_unsigned_i16 | SAD_unsigned_i8 | Interleave_high_8
+      | Interleave_high_16 | Interleave_high_64 | Interleave_low_8
+      | Interleave_low_16 | Interleave_low_64 | SLLi_i16 _ | SLLi_i32 _
+      | SLLi_i64 _ | SRLi_i16 _ | SRLi_i32 _ | SRLi_i64 _ | SRAi_i16 _
+      | SRAi_i32 _ | Shift_left_bytes _ | Shift_right_bytes _ | Cmp_f64 _
+      | Shuffle_64 _ | Shuffle_high_16 _ | Shuffle_low_16 _ | Mulhi_i16
+      | Mulhi_unsigned_i16 | Mullo_i16 | Mul_hadd_i16_to_i32 ),
       _ ) ->
     false
 
@@ -521,6 +528,7 @@ let equal_operation_sse41 l r =
     when Int.equal l r ->
     true
   | Round_scalar_f64 l, Round_scalar_f64 r
+  | Round_scalar_f32 l, Round_scalar_f32 r
   | Round_f64 l, Round_f64 r
   | Round_f32 l, Round_f32 r
     when float_rounding_equal l r ->
@@ -533,7 +541,7 @@ let equal_operation_sse41 l r =
       | Blend_16 _ | Blend_32 _ | Blend_64 _ | Dp_f32 _ | Dp_f64 _ | Mullo_i32
       | Extract_i8 _ | Extract_i16 _ | Extract_i32 _ | Extract_i64 _
       | Insert_i8 _ | Insert_i16 _ | Insert_i32 _ | Insert_i64 _ | Round_f64 _
-      | Round_scalar_f64 _ | Round_f32 _ ),
+      | Round_scalar_f64 _ | Round_scalar_f32 _ | Round_f32 _ ),
       _ ) ->
     false
 
@@ -607,6 +615,13 @@ let print_operation_bmi2 printreg op ppf arg =
 
 let print_operation_sse printreg op ppf arg =
   match op with
+  | Round_current_f32_i64 ->
+    fprintf ppf "round_current_f32_i64 %a" printreg arg.(0)
+  | Sqrt_scalar_f32 -> fprintf ppf "sqrt_scalar_f32 %a" printreg arg.(0)
+  | Min_scalar_f32 ->
+    fprintf ppf "min_scalar_f32 %a %a" printreg arg.(0) printreg arg.(1)
+  | Max_scalar_f32 ->
+    fprintf ppf "max_scalar_f32 %a %a" printreg arg.(0) printreg arg.(1)
   | Cmp_f32 i ->
     fprintf ppf "cmp_f32[%a] %a %a" print_float_condition i printreg arg.(0)
       printreg arg.(1)
@@ -636,7 +651,6 @@ let print_operation_sse printreg op ppf arg =
 let print_operation_sse2 printreg op ppf arg =
   match op with
   | Sqrt_scalar_f64 -> fprintf ppf "sqrt_scalar_f64 %a" printreg arg.(0)
-  | Sqrt_scalar_f32 -> fprintf ppf "sqrt_scalar_f32 %a" printreg arg.(0)
   | Min_scalar_f64 ->
     fprintf ppf "min_scalar_f64 %a %a" printreg arg.(0) printreg arg.(1)
   | Max_scalar_f64 ->
@@ -853,6 +867,9 @@ let print_operation_sse41 printreg op ppf arg =
   | Round_scalar_f64 i ->
     fprintf ppf "round_scalar_f64[%a] %a" print_float_rounding i printreg
       arg.(0)
+  | Round_scalar_f32 i ->
+    fprintf ppf "round_scalar_f32[%a] %a" print_float_rounding i printreg
+      arg.(0)
   | Round_f64 i ->
     fprintf ppf "round_f64[%a] %a" print_float_rounding i printreg arg.(0)
   | Round_f32 i ->
@@ -912,18 +929,20 @@ let class_of_operation_clmul = function Clmul_64 _ -> Pure
 let class_of_operation_bmi2 = function Deposit_64 | Extract_64 -> Pure
 
 let class_of_operation_sse = function
-  | Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32
-  | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64
-  | Interleave_high_32 | Interleave_low_32 | Interleave_low_32_regs
-  | Movemask_32 | Shuffle_32 _ ->
+  | Round_current_f32_i64
+  (* CR-someday mslater: (SIMD) reads current rounding mode *)
+  | Sqrt_scalar_f32 | Min_scalar_f32 | Max_scalar_f32 | Cmp_f32 _ | Add_f32
+  | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32 | Rcp_f32 | Sqrt_f32
+  | Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64 | Interleave_high_32
+  | Interleave_low_32 | Interleave_low_32_regs | Movemask_32 | Shuffle_32 _ ->
     Pure
 
 let class_of_operation_sse2 = function
   | Round_current_f64_i64
     (* CR-someday mslater: (SIMD) reads current rounding mode *)
   | Add_i8 | Add_i16 | Add_i32 | Add_i64 | Add_f64 | Add_saturating_i8
-  | Min_scalar_f64 | Max_scalar_f64 | Sqrt_scalar_f64 | Sqrt_scalar_f32
-  | Sqrt_f64 | Add_saturating_i16 | Add_saturating_unsigned_i8
+  | Min_scalar_f64 | Max_scalar_f64 | Sqrt_scalar_f64 | Sqrt_f64
+  | Add_saturating_i16 | Add_saturating_unsigned_i8
   | Add_saturating_unsigned_i16 | Sub_i8 | Sub_i16 | Sub_i32 | Sub_i64 | Sub_f64
   | Sub_saturating_i8 | Sub_saturating_i16 | Sub_saturating_unsigned_i8
   | Sub_saturating_unsigned_i16 | Max_unsigned_i8 | Max_i16 | Max_f64
@@ -962,8 +981,8 @@ let class_of_operation_sse41 = function
   | Extract_i32 _ | Extract_i64 _ | Insert_i8 _ | Insert_i16 _ | Insert_i32 _
   | Insert_i64 _ | Max_i8 | Max_i32 | Max_unsigned_i16 | Max_unsigned_i32
   | Min_i8 | Min_i32 | Min_unsigned_i16 | Min_unsigned_i32 | Round_f64 _
-  | Round_scalar_f64 _ | Round_f32 _ | Multi_sad_unsigned_i8 _
-  | Minpos_unsigned_i16 | Mullo_i32 ->
+  | Round_scalar_f64 _ | Round_scalar_f32 _ | Round_f32 _
+  | Multi_sad_unsigned_i8 _ | Minpos_unsigned_i16 | Mullo_i32 ->
     Pure
 
 let class_of_operation_sse42 = function
diff --git a/backend/amd64/simd_proc.ml b/backend/amd64/simd_proc.ml
index 24ba56b241e..e8a752cb31a 100644
--- a/backend/amd64/simd_proc.ml
+++ b/backend/amd64/simd_proc.ml
@@ -39,10 +39,12 @@ let register_behavior_clmul = function Clmul_64 _ -> R_RM_to_fst
 let register_behavior_bmi2 = function Extract_64 | Deposit_64 -> R_RM_to_R
 
 let register_behavior_sse = function
-  | Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32
-  | Interleave_low_32 | Interleave_high_32 | Shuffle_32 _ ->
+  | Min_scalar_f32 | Max_scalar_f32 | Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32
+  | Div_f32 | Max_f32 | Min_f32 | Interleave_low_32 | Interleave_high_32
+  | Shuffle_32 _ ->
     R_RM_to_fst
-  | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 -> RM_to_R
+  | Round_current_f32_i64 | Sqrt_scalar_f32 | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 ->
+    RM_to_R
   | Interleave_low_32_regs | High_64_to_low_64 | Low_64_to_high_64 -> R_R_to_fst
   | Movemask_32 -> R_to_R
 
@@ -64,7 +66,7 @@ let register_behavior_sse2 = function
     R_RM_to_fst
   | Shuffle_high_16 _ | Shuffle_low_16 _ | I32_to_f64 | I32_to_f32 | F64_to_i32
   | Round_current_f64_i64 | F64_to_f32 | F32_to_i32 | F32_to_f64 | Sqrt_f64
-  | Sqrt_scalar_f64 | Sqrt_scalar_f32 ->
+  | Sqrt_scalar_f64 ->
     RM_to_R
   | SLLi_i16 _ | SLLi_i32 _ | SLLi_i64 _ | SRLi_i16 _ | SRLi_i32 _ | SRLi_i64 _
   | SRAi_i16 _ | SRAi_i32 _ | Shift_left_bytes _ | Shift_right_bytes _ ->
@@ -91,7 +93,8 @@ let register_behavior_sse41 = function
     R_RM_to_fst
   | I8_sx_i16 | I8_sx_i32 | I8_sx_i64 | I16_sx_i32 | I16_sx_i64 | I32_sx_i64
   | I8_zx_i16 | I8_zx_i32 | I8_zx_i64 | I16_zx_i32 | I16_zx_i64 | I32_zx_i64
-  | Round_f64 _ | Round_f32 _ | Minpos_unsigned_i16 | Round_scalar_f64 _ ->
+  | Round_f64 _ | Round_f32 _ | Minpos_unsigned_i16 | Round_scalar_f64 _
+  | Round_scalar_f32 _ ->
     RM_to_R
   | Blendv_8 | Blendv_32 | Blendv_64 -> R_RM_xmm0_to_fst
   | Extract_i64 _ | Extract_i32 _ -> R_to_RM
diff --git a/backend/amd64/simd_selection.ml b/backend/amd64/simd_selection.ml
index a054a219c50..f6cf760eeca 100644
--- a/backend/amd64/simd_selection.ml
+++ b/backend/amd64/simd_selection.ml
@@ -78,6 +78,10 @@ let select_operation_bmi2 op args =
 
 let select_operation_sse op args =
   match op with
+  | "caml_sse_float32_sqrt" | "sqrtf" -> Some (Sqrt_scalar_f32, args)
+  | "caml_sse_float32_max" -> Some (Max_scalar_f32, args)
+  | "caml_sse_float32_min" -> Some (Min_scalar_f32, args)
+  | "caml_sse_cast_float32_int64" -> Some (Round_current_f32_i64, args)
   | "caml_sse_float32x4_cmp" ->
     let i, args = extract_constant args ~max:7 op in
     Some (Cmp_f32 (float_condition_of_int i), args)
@@ -103,7 +107,6 @@ let select_operation_sse op args =
 let select_operation_sse2 op args =
   match op with
   | "caml_sse2_float64_sqrt" | "sqrt" -> Some (Sqrt_scalar_f64, args)
-  | "caml_sse2_float32_sqrt" | "sqrtf" -> Some (Sqrt_scalar_f32, args)
   | "caml_sse2_float64_max" -> Some (Max_scalar_f64, args)
   | "caml_sse2_float64_min" -> Some (Min_scalar_f64, args)
   | "caml_sse2_cast_float64_int64" -> Some (Round_current_f64_i64, args)
@@ -335,8 +338,18 @@ let select_operation_sse41 op args =
       let i, args = extract_constant args ~max:15 op in
       Some (Round_f64 (float_rounding_of_int i), args)
     | "caml_sse41_float64_round" ->
+      (* CR-someday mslater: the following CR also applies here, but this
+         builtin is not exposed by any of the stdlib libraries. *)
       let i, args = extract_constant args ~max:15 op in
       Some (Round_scalar_f64 (float_rounding_of_int i), args)
+    | "caml_sse41_float32_round" ->
+      (* CR-someday mslater: this builtin is exposed by float32.ml, so must
+         actually be cross-platform. Currently, non-amd64 architectures will
+         fall back to a C implementation. If we want the arm64 backend to
+         specialize it, we should redefine the constant mapping from the amd64
+         values to a new sum type. *)
+      let i, args = extract_constant args ~max:15 op in
+      Some (Round_scalar_f32 (float_rounding_of_int i), args)
     | "caml_sse41_int8x16_max" -> Some (Max_i8, args)
     | "caml_sse41_int32x4_max" -> Some (Max_i32, args)
     | "caml_sse41_int16x8_max_unsigned" -> Some (Max_unsigned_i16, args)
diff --git a/backend/x86_ast.mli b/backend/x86_ast.mli
index a200c0b43e4..a19a3d20362 100644
--- a/backend/x86_ast.mli
+++ b/backend/x86_ast.mli
@@ -109,6 +109,8 @@ type arg =
   | Mem64_RIP of data_type * string * int
 
 type sse_instruction =
+  | MINSS of arg * arg
+  | MAXSS of arg * arg
   | CMPPS of float_condition * arg * arg
   | SHUFPS of arg * arg * arg
   | ADDPS of arg * arg
@@ -280,6 +282,7 @@ type sse41_instruction =
   | PMINUD of arg * arg
   | ROUNDPD of rounding * arg * arg
   | ROUNDPS of rounding * arg * arg
+  | ROUNDSS of rounding * arg * arg
   | MPSADBW of arg * arg * arg
   | PHMINPOSUW of arg * arg
   | PMULLD of arg * arg
diff --git a/backend/x86_binary_emitter.ml b/backend/x86_binary_emitter.ml
index 097a3d42210..f6c08a9ac56 100644
--- a/backend/x86_binary_emitter.ml
+++ b/backend/x86_binary_emitter.ml
@@ -1091,6 +1091,7 @@ let emit_dpps = suffix emit_osize_rf_rfm_3A 0x40
 let emit_dppd = suffix emit_osize_rf_rfm_3A 0x41
 let emit_roundps = suffix emit_osize_rf_rfm_3A 0x08
 let emit_roundpd = suffix emit_osize_rf_rfm_3A 0x09
+let emit_roundss = suffix emit_osize_rf_rfm_3A 0x0A
 
 let emit_pmulhw = emit_osize_rf_rfm 0xE5
 let emit_pmulhuw = emit_osize_rf_rfm 0xE4
@@ -1754,6 +1755,20 @@ let emit_mfence b = buf_opcodes b [ 0x0F; 0xAE; 0xF0 ]
 
 let emit_leave b = buf_int8 b 0xC9
 
+let emit_maxss b ~dst ~src =
+  match (dst, src) with
+  | (Regf reg, ((Regf _ | Mem _ | Mem64_RIP _) as rm)) ->
+    buf_int8 b 0xF3;
+    emit_mod_rm_reg b no_rex [ 0x0F; 0x5F ] rm (rd_of_regf reg)
+  | _ -> assert false
+
+let emit_minss b ~dst ~src =
+  match (dst, src) with
+  | (Regf reg, ((Regf _ | Mem _ | Mem64_RIP _) as rm)) ->
+    buf_int8 b 0xF3;
+    emit_mod_rm_reg b no_rex [ 0x0F; 0x5D ] rm (rd_of_regf reg)
+  | _ -> assert false
+
 let emit_maxsd b ~dst ~src =
   match (dst, src) with
   | (Regf reg, ((Regf _ | Mem _ | Mem64_RIP _) as rm)) ->
@@ -1933,6 +1948,8 @@ let assemble_instr b loc = function
   | XORPS (src, dst) -> emit_xor_float ~width:Cmm.Float32 b dst src
   | ANDPS (src, dst) -> emit_and_float ~width:Cmm.Float32 b dst src
   | CMPSS (condition, src, dst) -> emit_cmp_float ~width:Cmm.Float32 b ~condition ~dst ~src
+  | SSE MINSS (src, dst) -> emit_minss b ~dst ~src
+  | SSE MAXSS (src, dst) -> emit_maxss b ~dst ~src
   | SSE CMPPS (cmp, src, dst) -> emit_cmpps b (imm8_of_float_condition cmp) dst src
   | SSE ADDPS (src, dst) -> emit_addps b dst src
   | SSE SUBPS (src, dst) -> emit_subps b dst src
@@ -2095,6 +2112,7 @@ let assemble_instr b loc = function
   | SSE41 PMINUD (src, dst) -> emit_pminud b dst src
   | SSE41 ROUNDPD (n, src, dst) -> emit_roundpd b (imm8_of_rounding n) dst src
   | SSE41 ROUNDPS (n, src, dst) -> emit_roundps b (imm8_of_rounding n) dst src
+  | SSE41 ROUNDSS (n, src, dst) -> emit_roundss b (imm8_of_rounding n) dst src
   | SSE41 PHMINPOSUW (src, dst) -> emit_phminposuw b dst src
   | SSE41 PMULLD (src, dst) -> emit_pmulld b dst src
   | SSE41 MPSADBW (n, src, dst) -> emit_mpsadbw b (imm n) dst src
diff --git a/backend/x86_dsl.ml b/backend/x86_dsl.ml
index ab9d37c6e0f..bd4cdadfe87 100644
--- a/backend/x86_dsl.ml
+++ b/backend/x86_dsl.ml
@@ -205,6 +205,8 @@ module I = struct
   let andps x y = emit (ANDPS (x, y))
   let cmpss i x y = emit (CMPSS (i, x, y))
 
+  let minss x y = emit (SSE (MINSS (x, y)))
+  let maxss x y = emit (SSE (MAXSS (x, y)))
   let cmpps i x y = emit (SSE (CMPPS (i, x, y)))
   let shufps i x y = emit (SSE (SHUFPS (i, x, y)))
   let addps x y = emit (SSE (ADDPS (x, y)))
@@ -372,6 +374,7 @@ module I = struct
   let pminud x y = emit (SSE41 (PMINUD (x, y)))
   let roundpd i x y = emit (SSE41 (ROUNDPD (i, x, y)))
   let roundps i x y = emit (SSE41 (ROUNDPS (i, x, y)))
+  let roundss i x y = emit (SSE41 (ROUNDSS (i, x, y)))
   let mpsadbw i x y = emit (SSE41 (MPSADBW (i, x, y)))
   let phminposuw x y = emit (SSE41 (PHMINPOSUW (x, y)))
   let pmulld x y = emit (SSE41 (PMULLD (x, y)))
diff --git a/backend/x86_dsl.mli b/backend/x86_dsl.mli
index 58bef19032e..08152d9296b 100644
--- a/backend/x86_dsl.mli
+++ b/backend/x86_dsl.mli
@@ -200,6 +200,8 @@ module I : sig
 
   (* Float32 arithmetic *)
 
+  val minss: arg -> arg -> unit
+  val maxss: arg -> arg -> unit
   val addss: arg -> arg -> unit
   val subss: arg -> arg -> unit
   val mulss: arg -> arg -> unit
@@ -388,6 +390,7 @@ module I : sig
   val pminud: arg -> arg -> unit
   val roundpd: rounding -> arg -> arg -> unit
   val roundps: rounding -> arg -> arg -> unit
+  val roundss: rounding -> arg -> arg -> unit
   val mpsadbw: arg -> arg -> arg -> unit
   val phminposuw: arg -> arg -> unit
   val pmulld: arg -> arg -> unit
diff --git a/backend/x86_gas.ml b/backend/x86_gas.ml
index d8babc4f606..9bd0661b65f 100644
--- a/backend/x86_gas.ml
+++ b/backend/x86_gas.ml
@@ -219,6 +219,8 @@ let print_instr b = function
   | XORPS (arg1, arg2) -> i2 b "xorps" arg1 arg2
   | ANDPS (arg1, arg2) -> i2 b "andps" arg1 arg2
   | CMPSS (cmp, arg1, arg2) -> i2 b ("cmp" ^ string_of_float_condition cmp ^ "ss") arg1 arg2
+  | SSE MINSS (arg1, arg2) -> i2 b "minss" arg1 arg2
+  | SSE MAXSS (arg1, arg2) -> i2 b "maxss" arg1 arg2
   | SSE CMPPS (cmp, arg1, arg2) -> i2 b ("cmp" ^ string_of_float_condition cmp ^ "ps") arg1 arg2
   | SSE SHUFPS (shuf, arg1, arg2) -> i3 b "shufps" shuf arg1 arg2
   | SSE ADDPS (arg1, arg2) -> i2 b "addps" arg1 arg2
@@ -382,6 +384,7 @@ let print_instr b = function
   | SSE41 PMINUD (arg1, arg2) -> i2 b "pminud" arg1 arg2
   | SSE41 ROUNDPD (rd, arg1, arg2) -> i3 b "roundpd" (imm_of_rounding rd) arg1 arg2
   | SSE41 ROUNDPS (rd, arg1, arg2) -> i3 b "roundps" (imm_of_rounding rd) arg1 arg2
+  | SSE41 ROUNDSS (rd, arg1, arg2) -> i3 b "roundss" (imm_of_rounding rd) arg1 arg2
   | SSE41 MPSADBW (n, arg1, arg2) -> i3 b "mpsadbw" n arg1 arg2
   | SSE41 PHMINPOSUW (arg1, arg2) -> i2 b "phminposuw" arg1 arg2
   | SSE41 PMULLD (arg1, arg2) -> i2 b "pmulld" arg1 arg2
diff --git a/backend/x86_masm.ml b/backend/x86_masm.ml
index ed9527b2331..a510690c351 100644
--- a/backend/x86_masm.ml
+++ b/backend/x86_masm.ml
@@ -221,6 +221,8 @@ let print_instr b = function
   | CMPSS (cmp, arg1, arg2) -> i2 b ("cmp" ^ string_of_float_condition cmp ^ "ss") arg1 arg2
   | SSE CMPPS (cmp, arg1, arg2) -> i2 b ("cmp" ^ string_of_float_condition cmp ^ "ps") arg1 arg2
   | SSE SHUFPS (shuf, arg1, arg2) -> i3 b "shufps" shuf arg1 arg2
+  | SSE MINSS (arg1, arg2) -> i2 b "minss" arg1 arg2
+  | SSE MAXSS (arg1, arg2) -> i2 b "maxss" arg1 arg2
   | SSE ADDPS (arg1, arg2) -> i2 b "addps" arg1 arg2
   | SSE SUBPS (arg1, arg2) -> i2 b "subps" arg1 arg2
   | SSE MULPS (arg1, arg2) -> i2 b "mulps" arg1 arg2
@@ -382,6 +384,7 @@ let print_instr b = function
   | SSE41 PMINUD (arg1, arg2) -> i2 b "pminud" arg1 arg2
   | SSE41 ROUNDPD (rd, arg1, arg2) -> i3 b "roundpd" (imm_of_rounding rd) arg1 arg2
   | SSE41 ROUNDPS (rd, arg1, arg2) -> i3 b "roundps" (imm_of_rounding rd) arg1 arg2
+  | SSE41 ROUNDSS (rd, arg1, arg2) -> i3 b "roundss" (imm_of_rounding rd) arg1 arg2
   | SSE41 MPSADBW (n, arg1, arg2) -> i3 b "mpsadbw" n arg1 arg2
   | SSE41 PHMINPOSUW (arg1, arg2) -> i2 b "phminposuw" arg1 arg2
   | SSE41 PMULLD (arg1, arg2) -> i2 b "pmulld" arg1 arg2
diff --git a/ocaml/otherlibs/stdlib_beta/float32.ml b/ocaml/otherlibs/stdlib_beta/float32.ml
index 949a17ef321..bfa26e8d864 100644
--- a/ocaml/otherlibs/stdlib_beta/float32.ml
+++ b/ocaml/otherlibs/stdlib_beta/float32.ml
@@ -246,6 +246,16 @@ let[@inline] max (x : t) (y : t) =
   else if is_nan y then y
   else x
 
+module With_weird_nan_behavior = struct
+  external min : t -> t -> t
+    = "caml_sse_float32_min_bytecode" "caml_sse_float32_min"
+    [@@noalloc] [@@unboxed] [@@builtin]
+
+  external max : t -> t -> t
+    = "caml_sse_float32_max_bytecode" "caml_sse_float32_max"
+    [@@noalloc] [@@unboxed] [@@builtin]
+end
+
 let[@inline] min_max (x : t) (y : t) =
   if is_nan x || is_nan y then (nan, nan)
   else if y > x || ((not (sign_bit y)) && sign_bit x) then (x, y)
@@ -267,6 +277,24 @@ let[@inline] min_max_num (x : t) (y : t) =
   else if y > x || ((not (sign_bit y)) && sign_bit x) then (x, y)
   else (y, x)
 
+external iround_half_to_even : t -> int64
+  = "caml_sse_cast_float32_int64_bytecode" "caml_sse_cast_float32_int64"
+  [@@noalloc] [@@unboxed] [@@builtin]
+
+external round_intrinsic : (int[@untagged]) -> (t[@unboxed]) -> (t[@unboxed])
+  = "caml_sse41_float32_round_bytecode" "caml_sse41_float32_round"
+  [@@noalloc] [@@builtin]
+
+(* On amd64, these constants also imply _MM_FROUND_NO_EXC (suppress exceptions). *)
+let round_neg_inf = 0x9
+let round_pos_inf = 0xA
+let round_zero = 0xB
+let round_current_mode = 0xC
+let[@inline] round_half_to_even x = round_intrinsic round_current_mode x
+let[@inline] round_down x = round_intrinsic round_neg_inf x
+let[@inline] round_up x = round_intrinsic round_pos_inf x
+let[@inline] round_towards_zero x = round_intrinsic round_zero x
+
 external seeded_hash_param : int -> int -> int -> 'a -> int = "caml_hash_exn"
   [@@noalloc]
 
diff --git a/ocaml/otherlibs/stdlib_beta/float32.mli b/ocaml/otherlibs/stdlib_beta/float32.mli
index 3f90464182b..10fa57c795b 100644
--- a/ocaml/otherlibs/stdlib_beta/float32.mli
+++ b/ocaml/otherlibs/stdlib_beta/float32.mli
@@ -374,17 +374,14 @@ external erfc : t -> t = "caml_erfc_float32_bytecode" "erfcf"
 external trunc : t -> t = "caml_trunc_float32_bytecode" "truncf"
   [@@unboxed] [@@noalloc]
 (** [trunc x] rounds [x] to the nearest integer whose absolute value is
-   less than or equal to [x]. *)
+    less than or equal to [x]. *)
 
 external round : t -> t = "caml_round_float32_bytecode" "roundf"
   [@@unboxed] [@@noalloc]
 (** [round x] rounds [x] to the nearest integer with ties (fractional
-   values of 0.5s) rounded away from zero, regardless of the current
-   rounding direction.  If [x] is an integer, [+0.s], [-0.s], [nan], or
-   infinite, [x] itself is returned.
-
-   On 64-bit mingw-w64, this function may be emulated owing to a bug in the
-   C runtime library (CRT) on this platform. *)
+    values of 0.5s) rounded away from zero, regardless of the current
+    rounding direction.  If [x] is an integer, [+0.s], [-0.s], [nan], or
+    infinite, [x] itself is returned. *)
 
 external ceil : t -> t = "caml_ceil_float32_bytecode" "ceilf"
   [@@unboxed] [@@noalloc]
@@ -461,6 +458,24 @@ val max : t -> t -> t
 (** [max x y] returns the maximum of [x] and [y].  It returns [nan]
    when [x] or [y] is [nan].  Moreover [max (-0.s) (+0.s) = +0.s] *)
 
+module With_weird_nan_behavior : sig
+  external min : t -> t -> t
+    = "caml_sse_float32_min_bytecode" "caml_sse_float32_min"
+    [@@noalloc] [@@unboxed] [@@builtin]
+  (** [min x y] returns the minimum of [x] and [y].
+      If either [x] or [y] is [nan], [y] is returned.
+      If both [x] and [y] equal zero, [y] is returned.
+      The amd64 flambda-backend compiler translates this call to MINSS. *)
+
+  external max : t -> t -> t
+    = "caml_sse_float32_max_bytecode" "caml_sse_float32_max"
+    [@@noalloc] [@@unboxed] [@@builtin]
+  (** [max x y] returns the maximum of [x] and [y].
+      If either [x] or [y] is [nan], [y] is returned.
+      If both [x] and [y] equal zero, [y] is returned.
+      The amd64 flambda-backend compiler translates this call to MAXSS. *)
+end
+
 val min_max : t -> t -> t * t
 (** [min_max x y] is [(min x y, max x y)], just more efficient. *)
 
@@ -479,6 +494,34 @@ val min_max_num : t -> t -> t * t
    efficient.  Note that in particular [min_max_num x nan = (x, x)]
    and [min_max_num nan y = (y, y)]. *)
 
+external iround_half_to_even : t -> int64
+  = "caml_sse_cast_float32_int64_bytecode" "caml_sse_cast_float32_int64"
+  [@@noalloc] [@@unboxed] [@@builtin]
+(** Rounds a [float32] to an [int64] using the current rounding mode. The default
+    rounding mode is "round half to even", and we expect that no program will
+    change the rounding mode.
+    If the argument is NaN or infinite or if the rounded value cannot be
+    represented, then the result is unspecified.
+    The amd64 flambda-backend compiler translates this call to CVTSS2SI. *)
+
+val round_half_to_even : t -> t
+(** Rounds a [float32] to an integer [float32] using the current rounding
+    mode.  The default rounding mode is "round half to even", and we
+    expect that no program will change the rounding mode.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS. *)
+
+val round_down : t -> t
+(** Rounds a [float32] down to the next integer [float32] toward negative infinity.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
+val round_up : t -> t
+(** Rounds a [float32] up to the next integer [float32] toward positive infinity.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
+val round_towards_zero : t -> t
+(** Rounds a [float32] to the next integer [float32] toward zero.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
 val seeded_hash : int -> t -> int
 (** A seeded hash function for floats, with the same output value as
     {!Hashtbl.seeded_hash}. This function allows this module to be passed as
diff --git a/ocaml/otherlibs/stdlib_beta/float32_u.ml b/ocaml/otherlibs/stdlib_beta/float32_u.ml
index 5fc35f491c6..8ae6fcea3d5 100644
--- a/ocaml/otherlibs/stdlib_beta/float32_u.ml
+++ b/ocaml/otherlibs/stdlib_beta/float32_u.ml
@@ -184,6 +184,22 @@ let[@inline always] min x y = of_float32 (Float32.min (to_float32 x) (to_float32
 
 let[@inline always] max x y = of_float32 (Float32.max (to_float32 x) (to_float32 y))
 
+module With_weird_nan_behavior = struct
+  let[@inline always] min x y = of_float32 (Float32.With_weird_nan_behavior.min (to_float32 x) (to_float32 y))
+
+  let[@inline always] max x y = of_float32 (Float32.With_weird_nan_behavior.max (to_float32 x) (to_float32 y))
+end
+
 let[@inline always] min_num x y = of_float32 (Float32.min_num (to_float32 x) (to_float32 y))
 
 let[@inline always] max_num x y = of_float32 (Float32.max_num (to_float32 x) (to_float32 y))
+
+let iround_half_to_even x = unbox_int64 (Float32.iround_half_to_even (to_float32 x))
+
+let round_half_to_even x = of_float32 (Float32.round_half_to_even (to_float32 x))
+
+let round_down x = of_float32 (Float32.round_down (to_float32 x))
+
+let round_up x = of_float32 (Float32.round_up (to_float32 x))
+
+let round_towards_zero x = of_float32 (Float32.round_towards_zero (to_float32 x))
diff --git a/ocaml/otherlibs/stdlib_beta/float32_u.mli b/ocaml/otherlibs/stdlib_beta/float32_u.mli
index 154755ee386..0ce0538626f 100644
--- a/ocaml/otherlibs/stdlib_beta/float32_u.mli
+++ b/ocaml/otherlibs/stdlib_beta/float32_u.mli
@@ -364,6 +364,20 @@ val max : t -> t -> t
 (** [max x y] returns the maximum of [x] and [y].  It returns [nan]
     when [x] or [y] is [nan].  Moreover [max #-0.s #+0.s = #+0.s] *)
 
+module With_weird_nan_behavior : sig
+    val min : t -> t -> t
+    (** [min x y] returns the minimum of [x] and [y].
+        If either [x] or [y] is [nan], [y] is returned.
+        If both [x] and [y] equal zero, [y] is returned.
+        The amd64 flambda-backend compiler translates this call to MINSS. *)
+
+    val max : t -> t -> t
+    (** [max x y] returns the maximum of [x] and [y].
+        If either [x] or [y] is [nan], [y] is returned.
+        If both [x] and [y] equal zero, [y] is returned.
+        The amd64 flambda-backend compiler translates this call to MAXSS. *)
+end
+
 val min_num : t -> t -> t
 (** [min_num x y] returns the minimum of [x] and [y] treating [nan] as
     missing values.  If both [x] and [y] are [nan], [nan] is returned.
@@ -374,5 +388,31 @@ val max_num : t -> t -> t
     missing values.  If both [x] and [y] are [nan] [nan] is returned.
     Moreover [max_num #-0.s #+0.s = #+0.s] *)
 
+val iround_half_to_even : t -> int64#
+(** Rounds a [float32#] to an [int64#] using the current rounding mode. The default
+    rounding mode is "round half to even", and we expect that no program will
+    change the rounding mode.
+    If the argument is NaN or infinite or if the rounded value cannot be
+    represented, then the result is unspecified.
+    The amd64 flambda-backend compiler translates this call to CVTSS2SI. *)
+
+val round_half_to_even : t -> t
+(** Rounds a [float32#] to an integer [float32#] using the current rounding
+    mode.  The default rounding mode is "round half to even", and we
+    expect that no program will change the rounding mode.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS. *)
+
+val round_down : t -> t
+(** Rounds a [float32#] down to the next integer [float32#] toward negative infinity.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
+val round_up : t -> t
+(** Rounds a [float32#] up to the next integer [float32#] toward positive infinity.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
+val round_towards_zero : t -> t
+(** Rounds a [float32#] to the next integer [float32#] toward zero.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
 (* CR layouts v5: add back hash when we deal with the ad-hoc polymorphic
    functions. *)
diff --git a/ocaml/runtime/float32.c b/ocaml/runtime/float32.c
index 58e029f9364..6dd946a3ab5 100644
--- a/ocaml/runtime/float32.c
+++ b/ocaml/runtime/float32.c
@@ -309,6 +309,51 @@ CAMLprim value caml_ldexp_float32_bytecode(value f, value i)
   return caml_copy_float32(caml_ldexp_float32(Float32_val(f), Int_val(i)));
 }
 
+float caml_sse_float32_min(float x, float y) {
+  return x < y ? x : y;
+}
+
+CAMLprim value caml_sse_float32_min_bytecode(value x, value y) {
+  return Float32_val(x) < Float32_val(y) ? x : y;
+}
+
+float caml_sse_float32_max(float x, float y) {
+  return x > y ? x : y;
+}
+
+CAMLprim value caml_sse_float32_max_bytecode(value x, value y) {
+  return Float32_val(x) > Float32_val(y) ? x : y;
+}
+
+int64_t caml_sse_cast_float32_int64(float f)
+{
+  return llrintf(f);
+}
+
+CAMLprim value caml_sse_cast_float32_int64_bytecode(value f)
+{
+  return caml_copy_int64(caml_sse_cast_float32_int64(Float32_val(f)));
+}
+
+#define ROUND_NEG_INF 0x9
+#define ROUND_POS_INF 0xA
+#define ROUND_ZERO 0xB
+#define ROUND_CURRENT 0xC
+
+float caml_sse41_float32_round(int mode, float f) {
+  switch(mode) {
+  case ROUND_NEG_INF: return floorf(f);
+  case ROUND_POS_INF: return ceilf(f);
+  case ROUND_ZERO:    return truncf(f);
+  case ROUND_CURRENT: return rintf(f);
+  default: caml_fatal_error("Unknown rounding mode.");
+  }
+}
+
+CAMLprim value caml_sse41_float32_round_bytecode(value mode, value f) {
+  return caml_copy_float32(caml_sse41_float32_round(Int_val(mode), Float32_val(f)));
+}
+
 enum { FP_normal, FP_subnormal, FP_zero, FP_infinite, FP_nan };
 
 value caml_classify_float32(float vf)
diff --git a/ocaml/runtime4/float32.c b/ocaml/runtime4/float32.c
index 931dd960978..228492e2540 100644
--- a/ocaml/runtime4/float32.c
+++ b/ocaml/runtime4/float32.c
@@ -309,6 +309,51 @@ CAMLprim value caml_ldexp_float32_bytecode(value f, value i)
   return caml_copy_float32(caml_ldexp_float32(Float32_val(f), Int_val(i)));
 }
 
+float caml_sse_float32_min(float x, float y) {
+  return x < y ? x : y;
+}
+
+CAMLprim value caml_sse_float32_min_bytecode(value x, value y) {
+  return Float32_val(x) < Float32_val(y) ? x : y;
+}
+
+float caml_sse_float32_max(float x, float y) {
+  return x > y ? x : y;
+}
+
+CAMLprim value caml_sse_float32_max_bytecode(value x, value y) {
+  return Float32_val(x) > Float32_val(y) ? x : y;
+}
+
+int64_t caml_sse_cast_float32_int64(float f)
+{
+  return llrintf(f);
+}
+
+CAMLprim value caml_sse_cast_float32_int64_bytecode(value f)
+{
+  return caml_copy_int64(caml_sse_cast_float32_int64(Float32_val(f)));
+}
+
+#define ROUND_NEG_INF 0x9
+#define ROUND_POS_INF 0xA
+#define ROUND_ZERO 0xB
+#define ROUND_CURRENT 0xC
+
+float caml_sse41_float32_round(int mode, float f) {
+  switch(mode) {
+  case ROUND_NEG_INF: return floorf(f);
+  case ROUND_POS_INF: return ceilf(f);
+  case ROUND_ZERO:    return truncf(f);
+  case ROUND_CURRENT: return rintf(f);
+  default: caml_fatal_error("Unknown rounding mode.");
+  }
+}
+
+CAMLprim value caml_sse41_float32_round_bytecode(value mode, value f) {
+  return caml_copy_float32(caml_sse41_float32_round(Int_val(mode), Float32_val(f)));
+}
+
 enum { FP_normal, FP_subnormal, FP_zero, FP_infinite, FP_nan };
 
 value caml_classify_float32(float vf)
diff --git a/tests/small_numbers/float32_lib.ml b/tests/small_numbers/float32_lib.ml
index 327e0c98f93..022a47ad2f1 100644
--- a/tests/small_numbers/float32_lib.ml
+++ b/tests/small_numbers/float32_lib.ml
@@ -102,11 +102,16 @@ module CF32 = struct
 
   external min : t -> t -> t = "float32_min_boxed"
   external max : t -> t -> t = "float32_max_boxed"
+  external min_weird : t -> t -> t = "float32_min_weird_boxed"
+  external max_weird : t -> t -> t = "float32_max_weird_boxed"
   external min_num : t -> t -> t = "float32_min_num_boxed"
   external max_num : t -> t -> t = "float32_max_num_boxed"
   external min_max : t -> t -> t * t = "float32_min_max_boxed"
   external min_max_num : t -> t -> t * t = "float32_min_max_num_boxed"
 
+  external round_current : t -> t = "float32_round_current_boxed"
+  external iround_current : t -> int64 = "float32_iround_current_boxed"
+
   external compare : t -> t -> int = "float32_compare_boxed" [@@noalloc]
   let equal x y = compare x y = 0
 
@@ -230,6 +235,8 @@ let () =
     bit_eq (F32.copy_sign f1 f2) (CF32.copy_sign f1 f2);
     bit_eq (F32.min f1 f2) (CF32.min f1 f2);
     bit_eq (F32.max f1 f2) (CF32.max f1 f2);
+    bit_eq (F32.With_weird_nan_behavior.min f1 f2) (CF32.min_weird f1 f2);
+    bit_eq (F32.With_weird_nan_behavior.max f1 f2) (CF32.max_weird f1 f2);
     bit_eq (F32.min_num f1 f2) (CF32.min_num f1 f2);
     bit_eq (F32.max_num f1 f2) (CF32.max_num f1 f2);
     assert((F32.compare f1 f2) = (CF32.compare f1 f2));
@@ -237,6 +244,16 @@ let () =
   )
 ;;
 
+let () =
+  CF32.check_float32s (fun f _ ->
+    bit_eq (F32.round_up f) (CF32.ceil f);
+    bit_eq (F32.round_down f) (CF32.floor f);
+    bit_eq (F32.round_half_to_even f) (CF32.round_current f);
+    (* Returns int64, so can compare directly. *)
+    assert ((F32.iround_half_to_even f) = (CF32.iround_current f));
+  )
+;;
+
 let () =
   CF32.check_float32s (fun f _ ->
     let m, i = F32.frexp f in
diff --git a/tests/small_numbers/float32_u_lib.ml b/tests/small_numbers/float32_u_lib.ml
index 787a5fadd30..3fbbe6aec55 100644
--- a/tests/small_numbers/float32_u_lib.ml
+++ b/tests/small_numbers/float32_u_lib.ml
@@ -109,11 +109,16 @@ module CF32 = struct
 
   external min : t -> t -> t = "float32_min_boxed"
   external max : t -> t -> t = "float32_max_boxed"
+  external min_weird : t -> t -> t = "float32_min_weird_boxed"
+  external max_weird : t -> t -> t = "float32_max_weird_boxed"
   external min_num : t -> t -> t = "float32_min_num_boxed"
   external max_num : t -> t -> t = "float32_max_num_boxed"
   external min_max : t -> t -> t * t = "float32_min_max_boxed"
   external min_max_num : t -> t -> t * t = "float32_min_max_num_boxed"
 
+  external round_current : t -> t = "float32_round_current_boxed"
+  external iround_current : t -> int64 = "float32_iround_current_boxed"
+
   external compare : t -> t -> int = "float32_compare_boxed" [@@noalloc]
   let equal x y = compare x y = 0
 
@@ -222,6 +227,8 @@ let () =
     bit_eq (F32.copy_sign u1 u2) (CF32.copy_sign f1 f2);
     bit_eq (F32.min u1 u2) (CF32.min f1 f2);
     bit_eq (F32.max u1 u2) (CF32.max f1 f2);
+    bit_eq (F32.With_weird_nan_behavior.min u1 u2) (CF32.min_weird f1 f2);
+    bit_eq (F32.With_weird_nan_behavior.max u1 u2) (CF32.max_weird f1 f2);
     bit_eq (F32.min_num u1 u2) (CF32.min_num f1 f2);
     bit_eq (F32.max_num u1 u2) (CF32.max_num f1 f2);
     assert((F32.compare u1 u2) = (CF32.compare f1 f2));
@@ -229,6 +236,17 @@ let () =
   )
 ;;
 
+let () =
+  CF32.check_float32s (fun f _ ->
+    let u = F32.of_float32 f in
+    bit_eq (F32.round_up u) (CF32.ceil f);
+    bit_eq (F32.round_down u) (CF32.floor f);
+    bit_eq (F32.round_half_to_even u) (CF32.round_current f);
+    (* Returns int64, so can compare directly. *)
+    assert (box_int64 (F32.iround_half_to_even u) = (CF32.iround_current f));
+  )
+;;
+
 let () =
   CF32.check_float32s (fun f _ ->
     let u = F32.of_float32 f in
diff --git a/tests/small_numbers/stubs.c b/tests/small_numbers/stubs.c
index be933a47efb..17c2db69709 100644
--- a/tests/small_numbers/stubs.c
+++ b/tests/small_numbers/stubs.c
@@ -192,6 +192,20 @@ value float32_max_boxed(value l, value r)
     return caml_copy_float32(fmaxf(f, g));
 }
 
+value float32_min_weird_boxed(value l, value r)
+{
+    float f = Float32_val(l);
+    float g = Float32_val(r);
+    return caml_copy_float32(f < g ? f : g);
+}
+
+value float32_max_weird_boxed(value l, value r)
+{
+    float f = Float32_val(l);
+    float g = Float32_val(r);
+    return caml_copy_float32(f > g ? f : g);
+}
+
 value float32_min_num_boxed(value l, value r)
 {
     float f = Float32_val(l);
@@ -312,3 +326,13 @@ value float32_classify_boxed(value f)
 {
     return float32_classify(Float32_val(f));
 }
+
+value float32_round_current_boxed(value f)
+{
+    return caml_copy_float32(rintf(Float32_val(f)));
+}
+
+value float32_iround_current_boxed(value f)
+{
+    return caml_copy_int64(llrintf(Float32_val(f)));
+}