ocaml-flambda · TheNumbat · Jul 9, 2024 · Jun 14, 2024 · Jun 14, 2024 · Jun 18, 2024
diff --git a/backend/amd64/arch.ml b/backend/amd64/arch.ml
@@ -116,7 +116,7 @@ let assert_simd_enabled () =
   (extension_universe stable) in your library configuration file."
 
 let assert_float32_enabled () =
-  if not (Language_extension.is_enabled Small_numbers) then
+  if not (Language_extension.(is_at_least Small_numbers Stable)) then
   Misc.fatal_error "float32 is not enabled. This error might happen \
   if you are using float32 yourself or are linking code that uses it. \
   Pass [-extension-universe beta] to the compiler, or set \

diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
@@ -1016,12 +1016,8 @@ let emit_static_cast (cast : Cmm.static_cast) i =
     I.movq (arg i 0) (res i 0)
   | Scalar_of_v128 Int32x4 -> I.movd (arg i 0) (res32 i 0)
   | V128_of_scalar Int32x4 -> I.movd (arg32 i 0) (res i 0)
-  | V128_of_scalar Float32x4 ->
-    (* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
-    I.cvtsd2ss (arg i 0) (res i 0)
-  | Scalar_of_v128 Float32x4 ->
-    (* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
-    I.cvtss2sd (arg i 0) (res i 0)
+  | V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4 ->
+    if distinct then I.movss (arg i 0) (res i 0)
   | Scalar_of_v128 Int16x8 ->
     (* [movw] and [movzx] cannot operate on vector registers.
        We must zero extend as the result is an untagged positive int.

diff --git a/backend/amd64/proc.ml b/backend/amd64/proc.ml
@@ -166,8 +166,8 @@ let hard_float32_reg =
   for i = 0 to 15 do v.(i) <- Reg.at_location Float32 (Reg (100 + i)) done;
   fun () -> assert_float32_enabled (); v
 
-let extension_regs (type a) ?prefix (ext : a Language_extension.t) () =
-  if not (Language_extension.is_enabled ext) then [||]
+let extension_regs (type a) ?prefix (ext : a Language_extension.t) (maturity : a) () =
+  if not (Language_extension.is_at_least ext maturity) then [||]
   else let regs =
     match ext with
     | SIMD -> hard_vec128_reg ()
@@ -180,9 +180,10 @@ let extension_regs (type a) ?prefix (ext : a Language_extension.t) () =
   | Some p -> Array.sub regs 0 (Int.min p (Array.length regs))
 
 let all_phys_regs =
+  let open Language_extension in
   let basic_regs = Array.append hard_int_reg hard_float_reg in
-  let simd_regs = extension_regs SIMD in
-  let f32_regs = extension_regs Small_numbers in
+  let simd_regs = extension_regs SIMD () in
+  let f32_regs = extension_regs Small_numbers Stable in
   fun () -> Array.append basic_regs
             (Array.append (simd_regs ()) (f32_regs ()))
 
@@ -204,7 +205,7 @@ let rbp = phys_reg Int 12
 let destroy_xmm n =
   let f64 = [| phys_reg Float (100 + n) |] in
   let f32 =
-    if Language_extension.is_enabled Small_numbers
+    if Language_extension.(is_at_least Small_numbers Stable)
     then [| phys_reg Float32 (100 + n) |]
     else [||]
   in
@@ -410,23 +411,25 @@ let int_regs_destroyed_at_c_call =
 
 let destroyed_at_c_call_win64 =
   (* Win64: rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15 preserved *)
+  let open Language_extension in
   let basic_regs = Array.append
     (Array.map (phys_reg Int) int_regs_destroyed_at_c_call_win64)
     (Array.sub hard_float_reg 0 6)
   in
-  let v128_regs = extension_regs ~prefix:6 SIMD in
-  let f32_regs = extension_regs ~prefix:6 Small_numbers in
+  let v128_regs = extension_regs ~prefix:6 SIMD () in
+  let f32_regs = extension_regs ~prefix:6 Small_numbers Stable in
   fun () -> Array.append basic_regs
               (Array.append (v128_regs ()) (f32_regs ()))
 
 let destroyed_at_c_call_unix =
+  let open Language_extension in
   (* Unix: rbx, rbp, r12-r15 preserved *)
   let basic_regs = Array.append
       (Array.map (phys_reg Int) int_regs_destroyed_at_c_call)
     hard_float_reg
   in
-  let v128_regs = extension_regs SIMD in
-  let f32_regs = extension_regs Small_numbers in
+  let v128_regs = extension_regs SIMD () in
+  let f32_regs = extension_regs Small_numbers Stable in
   fun () -> Array.append basic_regs
               (Array.append (v128_regs ()) (f32_regs ()))
 

diff --git a/backend/amd64/regalloc_stack_operands.ml b/backend/amd64/regalloc_stack_operands.ml
@@ -179,11 +179,9 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
     | R_to_RM -> may_use_stack_operand_for_result map instr ~num_args:1
     | RM_to_R -> may_use_stack_operand_for_only_argument map instr ~has_result:true)
   | Op (Reinterpret_cast (Float_of_float32 | Float32_of_float | V128_of_v128))
-  | Op (Static_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2)) ->
-    unary_operation_argument_or_result_on_stack map instr
+  | Op (Static_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2))
   | Op (Static_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4)) ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
-    may_use_stack_operand_for_only_argument map instr ~has_result:true
+    unary_operation_argument_or_result_on_stack map instr
   | Op (Reinterpret_cast (Float_of_int64 | Float32_of_int32))
   | Op (Static_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16))) ->
     may_use_stack_operand_for_only_argument map instr ~has_result:true

diff --git a/backend/amd64/reload.ml b/backend/amd64/reload.ml
@@ -164,16 +164,13 @@ method! reload_operation op arg res =
     (* Result must be in register, but argument can be on stack *)
     (arg, (if stackp res.(0) then [| self#makereg res.(0) |] else res))
   | Ireinterpret_cast (Float_of_float32 |  Float32_of_float | V128_of_v128)
-  | Istatic_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2) ->
+  | Istatic_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2)
+  | Istatic_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4) ->
     (* These are just moves; either the argument or result may be on the stack. *)
     begin match stackp arg.(0), stackp res.(0) with
     | true, true -> ([| self#makereg arg.(0) |], res)
     | _ -> (arg, res)
     end
-  | Istatic_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4) ->
-    (* These do additional logic requiring the result to be a register.
-       CR mslater: (SIMD) replace once we have unboxed float32 *)
-    (arg, [| self#makereg res.(0) |])
   | Ireinterpret_cast (Float_of_int64 | Float32_of_int32)
   | Istatic_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16)) ->
     (* Int -> Vec regs need the result to be a register. *)

diff --git a/backend/amd64/simd_proc.ml b/backend/amd64/simd_proc.ml
@@ -99,7 +99,7 @@ let register_behavior_sse41 = function
   | Blendv_8 | Blendv_32 | Blendv_64 -> R_RM_xmm0_to_fst
   | Extract_i64 _ | Extract_i32 _ -> R_to_RM
   | Extract_i8 _ | Extract_i16 _ ->
-    (* CR mslater: (SIMD): replace once we have int8/int16/float32 *)
+    (* CR mslater: (SIMD): replace once we have int8/int16 *)
     R_to_R
 
 let register_behavior_sse42 = function

diff --git a/backend/cmm_builtins.ml b/backend/cmm_builtins.ml
@@ -195,30 +195,45 @@ let bigstring_atomic_add size (arg1, arg2, arg3) dbg =
 let bigstring_atomic_sub size (arg1, arg2, arg3) dbg =
   bigstring_atomic_add size (arg1, arg2, neg_int arg3 dbg) dbg
 
-(* Assumes unboxed float64 *)
-let rec const_float_args n args name =
+let rec const_args_gen ~extract ~type_name n args name =
   match n, args with
   | 0, [] -> []
-  | n, Cconst_float (f, _) :: args -> f :: const_float_args (n - 1) args name
-  | _ -> bad_immediate "Did not find constant float arguments for %s" name
+  | _, [] ->
+    bad_immediate "Missing %d constant %s argument(s) for %s" n type_name name
+  | n, arg :: args -> (
+    match extract arg with
+    | Some value ->
+      value :: const_args_gen ~extract ~type_name (n - 1) args name
+    | None ->
+      bad_immediate "Did not find constant %s arguments for %s" type_name name)
+
+(* Assumes unboxed float32 *)
+let const_float32_args =
+  const_args_gen
+    ~extract:(function Cconst_float32 (f, _) -> Some f | _ -> None)
+    ~type_name:"float32"
+
+(* Assumes unboxed float64 *)
+let const_float_args =
+  const_args_gen
+    ~extract:(function Cconst_float (f, _) -> Some f | _ -> None)
+    ~type_name:"float"
 
 (* Assumes untagged int or unboxed int32, always representable by int63 *)
-let rec const_int_args n args name =
-  match n, args with
-  | 0, [] -> []
-  | n, Cconst_int (i, _) :: args -> i :: const_int_args (n - 1) args name
-  | _ -> bad_immediate "Did not find constant int arguments for %s" name
+let const_int_args =
+  const_args_gen
+    ~extract:(function Cconst_int (i, _) -> Some i | _ -> None)
+    ~type_name:"int"
 
 (* Assumes unboxed int64: no tag, comes as Cconst_int when representable by
    int63, otherwise we get Cconst_natint *)
-let rec const_int64_args n args name =
-  match n, args with
-  | 0, [] -> []
-  | n, Cconst_int (i, _) :: args ->
-    Int64.of_int i :: const_int64_args (n - 1) args name
-  | n, Cconst_natint (i, _) :: args ->
-    Int64.of_nativeint i :: const_int64_args (n - 1) args name
-  | _ -> bad_immediate "Did not find constant int64 arguments for %s" name
+let const_int64_args =
+  const_args_gen
+    ~extract:(function
+      | Cconst_int (i, _) -> Some (Int64.of_int i)
+      | Cconst_natint (i, _) -> Some (Int64.of_nativeint i)
+      | _ -> None)
+    ~type_name:"int64"
 
 let int64_of_int8 i =
   (* CR mslater: (SIMD) replace once we have unboxed int8 *)
@@ -238,7 +253,6 @@ let int64_of_int32 i =
   Int64.of_int i |> Int64.logand 0xffffffffL
 
 let int64_of_float32 f =
-  (* CR mslater: (SIMD) replace once we have unboxed float32 *)
   Int32.bits_of_float f |> Int64.of_int32 |> Int64.logand 0xffffffffL
 
 let pack_int32s i0 i1 = Int64.(logor (shift_left i1 32) i0)
@@ -272,12 +286,10 @@ let transl_vec128_builtin name args dbg _typ_res =
   | "caml_float64x2_low_to_float" ->
     let op = Cstatic_cast (Scalar_of_v128 Float64x2) in
     if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
-  | "caml_float32x4_low_of_float" ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
+  | "caml_float32x4_low_of_float32" ->
     let op = Cstatic_cast (V128_of_scalar Float32x4) in
     if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
-  | "caml_float32x4_low_to_float" ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
+  | "caml_float32x4_low_to_float32" ->
     let op = Cstatic_cast (Scalar_of_v128 Float32x4) in
     if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
   | "caml_int64x2_low_of_int64" ->
@@ -310,15 +322,13 @@ let transl_vec128_builtin name args dbg _typ_res =
     if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
   (* Constants *)
   | "caml_float32x4_const1" ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
-    let f = const_float_args 1 args name |> List.hd in
+    let f = const_float32_args 1 args name |> List.hd in
     let i = int64_of_float32 f in
     let i = pack_int32s i i in
     Some (Cconst_vec128 ({ low = i; high = i }, dbg))
   | "caml_float32x4_const4" ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
     let i0, i1, i2, i3 =
-      match const_float_args 4 args name |> List.map int64_of_float32 with
+      match const_float32_args 4 args name |> List.map int64_of_float32 with
       | [i0; i1; i2; i3] -> i0, i1, i2, i3
       | _ -> assert false
     in

diff --git a/backend/cmm_helpers.ml b/backend/cmm_helpers.ml
@@ -1704,6 +1704,7 @@ let curry_function_sym function_kind arity result =
 let bigarray_elt_size_in_bytes : Lambda.bigarray_kind -> int = function
   | Pbigarray_unknown -> assert false
   | Pbigarray_float32 -> 4
+  | Pbigarray_float32_t -> 4
   | Pbigarray_float64 -> 8
   | Pbigarray_sint8 -> 1
   | Pbigarray_uint8 -> 1
@@ -1719,6 +1720,7 @@ let bigarray_elt_size_in_bytes : Lambda.bigarray_kind -> int = function
 let bigarray_word_kind : Lambda.bigarray_kind -> memory_chunk = function
   | Pbigarray_unknown -> assert false
   | Pbigarray_float32 -> Single { reg = Float64 }
+  | Pbigarray_float32_t -> Single { reg = Float32 }
   | Pbigarray_float64 -> Double
   | Pbigarray_sint8 -> Byte_signed
   | Pbigarray_uint8 -> Byte_unsigned
@@ -2173,6 +2175,15 @@ let unaligned_set_64 ptr idx newval dbg =
                     [add_int (add_int ptr idx dbg) (cconst_int 7) dbg; b8],
                     dbg ) ) ) )
 
+let unaligned_load_f32 ptr idx dbg =
+  Cop (mk_load_mut (Single { reg = Float32 }), [add_int ptr idx dbg], dbg)
+
+let unaligned_set_f32 ptr idx newval dbg =
+  Cop
+    ( Cstore (Single { reg = Float32 }, Assignment),
+      [add_int ptr idx dbg; newval],
+      dbg )
+
 let unaligned_load_128 ptr idx dbg =
   assert (size_vec128 = 16);
   Cop (mk_load_mut Onetwentyeight_unaligned, [add_int ptr idx dbg], dbg)

diff --git a/backend/cmm_helpers.mli b/backend/cmm_helpers.mli
@@ -390,6 +390,11 @@ val unaligned_load_32 : expression -> expression -> Debuginfo.t -> expression
 val unaligned_set_32 :
   expression -> expression -> expression -> Debuginfo.t -> expression
 
+val unaligned_load_f32 : expression -> expression -> Debuginfo.t -> expression
+
+val unaligned_set_f32 :
+  expression -> expression -> expression -> Debuginfo.t -> expression
+
 val unaligned_load_64 : expression -> expression -> Debuginfo.t -> expression
 
 val unaligned_set_64 :

diff --git a/backend/selectgen.ml b/backend/selectgen.ml
@@ -186,9 +186,8 @@ let oper_result_type = function
   | Cstatic_cast (Float32_of_float | Float_of_int Float32) -> typ_float32
   | Cstatic_cast (Int_of_float (Float64 | Float32)) -> typ_int
   | Cstatic_cast (V128_of_scalar _) -> typ_vec128
-  | Cstatic_cast (Scalar_of_v128 (Float64x2 | Float32x4)) ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
-    typ_float
+  | Cstatic_cast (Scalar_of_v128 Float64x2) -> typ_float
+  | Cstatic_cast (Scalar_of_v128 Float32x4) -> typ_float32
   | Cstatic_cast (Scalar_of_v128 (Int8x16 | Int16x8 | Int32x4 | Int64x2)) -> typ_int
   | Craise _ -> typ_void
   | Cprobe _ -> typ_void

diff --git a/backend/x86_gas.ml b/backend/x86_gas.ml
@@ -138,7 +138,7 @@ let print_instr b = function
   | CQO ->  i0 b "cqto"
   | CVTSS2SI (arg1, arg2) -> i2 b "cvtss2si" arg1 arg2
   | CVTSD2SI (arg1, arg2) -> i2 b "cvtsd2si" arg1 arg2
-  | CVTSI2SS (arg1, arg2) -> i2 b "cvtsi2ss" arg1 arg2
+  | CVTSI2SS (arg1, arg2) -> i2 b ("cvtsi2ss" ^ suf arg1) arg1 arg2
   | CVTSD2SS (arg1, arg2) -> i2 b "cvtsd2ss" arg1 arg2
   | CVTSI2SD (arg1, arg2) -> i2 b ("cvtsi2sd" ^ suf arg1) arg1 arg2
   | CVTSS2SD (arg1, arg2) -> i2 b "cvtss2sd" arg1 arg2

diff --git a/middle_end/flambda2/from_lambda/closure_conversion.ml b/middle_end/flambda2/from_lambda/closure_conversion.ml
@@ -856,16 +856,18 @@ let close_primitive acc env ~let_bound_ids_with_kinds named
       | Pandbint _ | Porbint _ | Pxorbint _ | Plslbint _ | Plsrbint _
       | Pasrbint _ | Pbintcomp _ | Punboxed_int_comp _ | Pbigarrayref _
       | Pbigarrayset _ | Pbigarraydim _ | Pstring_load_16 _ | Pstring_load_32 _
-      | Pstring_load_64 _ | Pstring_load_128 _ | Pbytes_load_16 _
-      | Pbytes_load_32 _ | Pbytes_load_64 _ | Pbytes_load_128 _
-      | Pbytes_set_16 _ | Pbytes_set_32 _ | Pbytes_set_64 _ | Pbytes_set_128 _
-      | Pbigstring_load_16 _ | Pbigstring_load_32 _ | Pbigstring_load_64 _
-      | Pbigstring_load_128 _ | Pbigstring_set_16 _ | Pbigstring_set_32 _
-      | Pbigstring_set_64 _ | Pbigstring_set_128 _ | Pfloatarray_load_128 _
-      | Pfloat_array_load_128 _ | Pint_array_load_128 _
-      | Punboxed_float_array_load_128 _ | Punboxed_int32_array_load_128 _
-      | Punboxed_int64_array_load_128 _ | Punboxed_nativeint_array_load_128 _
-      | Pfloatarray_set_128 _ | Pfloat_array_set_128 _ | Pint_array_set_128 _
+      | Pstring_load_f32 _ | Pstring_load_64 _ | Pstring_load_128 _
+      | Pbytes_load_16 _ | Pbytes_load_32 _ | Pbytes_load_f32 _
+      | Pbytes_load_64 _ | Pbytes_load_128 _ | Pbytes_set_16 _ | Pbytes_set_32 _
+      | Pbytes_set_f32 _ | Pbytes_set_64 _ | Pbytes_set_128 _
+      | Pbigstring_load_16 _ | Pbigstring_load_32 _ | Pbigstring_load_f32 _
+      | Pbigstring_load_64 _ | Pbigstring_load_128 _ | Pbigstring_set_16 _
+      | Pbigstring_set_32 _ | Pbigstring_set_f32 _ | Pbigstring_set_64 _
+      | Pbigstring_set_128 _ | Pfloatarray_load_128 _ | Pfloat_array_load_128 _
+      | Pint_array_load_128 _ | Punboxed_float_array_load_128 _
+      | Punboxed_int32_array_load_128 _ | Punboxed_int64_array_load_128 _
+      | Punboxed_nativeint_array_load_128 _ | Pfloatarray_set_128 _
+      | Pfloat_array_set_128 _ | Pint_array_set_128 _
       | Punboxed_float_array_set_128 _ | Punboxed_int32_array_set_128 _
       | Punboxed_int64_array_set_128 _ | Punboxed_nativeint_array_set_128 _
       | Pctconst _ | Pbswap16 | Pbbswap _ | Pint_as_pointer _ | Popaque _