Skip to content

Use float32 for float32x4 scalar casts #2710

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/amd64/arch.ml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ let assert_simd_enabled () =
(extension_universe stable) in your library configuration file."

let assert_float32_enabled () =
if not (Language_extension.is_enabled Small_numbers) then
if not (Language_extension.(is_at_least Small_numbers Stable)) then
Misc.fatal_error "float32 is not enabled. This error might happen \
if you are using float32 yourself or are linking code that uses it. \
Pass [-extension-universe beta] to the compiler, or set \
Expand Down
8 changes: 2 additions & 6 deletions backend/amd64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -1016,12 +1016,8 @@ let emit_static_cast (cast : Cmm.static_cast) i =
I.movq (arg i 0) (res i 0)
| Scalar_of_v128 Int32x4 -> I.movd (arg i 0) (res32 i 0)
| V128_of_scalar Int32x4 -> I.movd (arg32 i 0) (res i 0)
| V128_of_scalar Float32x4 ->
(* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
I.cvtsd2ss (arg i 0) (res i 0)
| Scalar_of_v128 Float32x4 ->
(* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
I.cvtss2sd (arg i 0) (res i 0)
| V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4 ->
if distinct then I.movss (arg i 0) (res i 0)
| Scalar_of_v128 Int16x8 ->
(* [movw] and [movzx] cannot operate on vector registers.
We must zero extend as the result is an untagged positive int.
Expand Down
21 changes: 12 additions & 9 deletions backend/amd64/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ let hard_float32_reg =
for i = 0 to 15 do v.(i) <- Reg.at_location Float32 (Reg (100 + i)) done;
fun () -> assert_float32_enabled (); v

let extension_regs (type a) ?prefix (ext : a Language_extension.t) () =
if not (Language_extension.is_enabled ext) then [||]
let extension_regs (type a) ?prefix (ext : a Language_extension.t) (maturity : a) () =
if not (Language_extension.is_at_least ext maturity) then [||]
else let regs =
match ext with
| SIMD -> hard_vec128_reg ()
Expand All @@ -180,9 +180,10 @@ let extension_regs (type a) ?prefix (ext : a Language_extension.t) () =
| Some p -> Array.sub regs 0 (Int.min p (Array.length regs))

let all_phys_regs =
let open Language_extension in
let basic_regs = Array.append hard_int_reg hard_float_reg in
let simd_regs = extension_regs SIMD in
let f32_regs = extension_regs Small_numbers in
let simd_regs = extension_regs SIMD () in
let f32_regs = extension_regs Small_numbers Stable in
fun () -> Array.append basic_regs
(Array.append (simd_regs ()) (f32_regs ()))

Expand All @@ -204,7 +205,7 @@ let rbp = phys_reg Int 12
let destroy_xmm n =
let f64 = [| phys_reg Float (100 + n) |] in
let f32 =
if Language_extension.is_enabled Small_numbers
if Language_extension.(is_at_least Small_numbers Stable)
then [| phys_reg Float32 (100 + n) |]
else [||]
in
Expand Down Expand Up @@ -410,23 +411,25 @@ let int_regs_destroyed_at_c_call =

let destroyed_at_c_call_win64 =
(* Win64: rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15 preserved *)
let open Language_extension in
let basic_regs = Array.append
(Array.map (phys_reg Int) int_regs_destroyed_at_c_call_win64)
(Array.sub hard_float_reg 0 6)
in
let v128_regs = extension_regs ~prefix:6 SIMD in
let f32_regs = extension_regs ~prefix:6 Small_numbers in
let v128_regs = extension_regs ~prefix:6 SIMD () in
let f32_regs = extension_regs ~prefix:6 Small_numbers Stable in
fun () -> Array.append basic_regs
(Array.append (v128_regs ()) (f32_regs ()))

let destroyed_at_c_call_unix =
let open Language_extension in
(* Unix: rbx, rbp, r12-r15 preserved *)
let basic_regs = Array.append
(Array.map (phys_reg Int) int_regs_destroyed_at_c_call)
hard_float_reg
in
let v128_regs = extension_regs SIMD in
let f32_regs = extension_regs Small_numbers in
let v128_regs = extension_regs SIMD () in
let f32_regs = extension_regs Small_numbers Stable in
fun () -> Array.append basic_regs
(Array.append (v128_regs ()) (f32_regs ()))

Expand Down
6 changes: 2 additions & 4 deletions backend/amd64/regalloc_stack_operands.ml
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,9 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
| R_to_RM -> may_use_stack_operand_for_result map instr ~num_args:1
| RM_to_R -> may_use_stack_operand_for_only_argument map instr ~has_result:true)
| Op (Reinterpret_cast (Float_of_float32 | Float32_of_float | V128_of_v128))
| Op (Static_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2)) ->
unary_operation_argument_or_result_on_stack map instr
| Op (Static_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2))
| Op (Static_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4)) ->
(* CR mslater: (SIMD) replace once we have unboxed float32 *)
may_use_stack_operand_for_only_argument map instr ~has_result:true
unary_operation_argument_or_result_on_stack map instr
| Op (Reinterpret_cast (Float_of_int64 | Float32_of_int32))
| Op (Static_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16))) ->
may_use_stack_operand_for_only_argument map instr ~has_result:true
Expand Down
7 changes: 2 additions & 5 deletions backend/amd64/reload.ml
Original file line number Diff line number Diff line change
Expand Up @@ -164,16 +164,13 @@ method! reload_operation op arg res =
(* Result must be in register, but argument can be on stack *)
(arg, (if stackp res.(0) then [| self#makereg res.(0) |] else res))
| Ireinterpret_cast (Float_of_float32 | Float32_of_float | V128_of_v128)
| Istatic_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2) ->
| Istatic_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2)
| Istatic_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4) ->
(* These are just moves; either the argument or result may be on the stack. *)
begin match stackp arg.(0), stackp res.(0) with
| true, true -> ([| self#makereg arg.(0) |], res)
| _ -> (arg, res)
end
| Istatic_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4) ->
(* These do additional logic requiring the result to be a register.
CR mslater: (SIMD) replace once we have unboxed float32 *)
(arg, [| self#makereg res.(0) |])
| Ireinterpret_cast (Float_of_int64 | Float32_of_int32)
| Istatic_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16)) ->
(* Int -> Vec regs need the result to be a register. *)
Expand Down
2 changes: 1 addition & 1 deletion backend/amd64/simd_proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ let register_behavior_sse41 = function
| Blendv_8 | Blendv_32 | Blendv_64 -> R_RM_xmm0_to_fst
| Extract_i64 _ | Extract_i32 _ -> R_to_RM
| Extract_i8 _ | Extract_i16 _ ->
(* CR mslater: (SIMD): replace once we have int8/int16/float32 *)
(* CR mslater: (SIMD): replace once we have int8/int16 *)
R_to_R

let register_behavior_sse42 = function
Expand Down
62 changes: 36 additions & 26 deletions backend/cmm_builtins.ml
Original file line number Diff line number Diff line change
Expand Up @@ -195,30 +195,45 @@ let bigstring_atomic_add size (arg1, arg2, arg3) dbg =
let bigstring_atomic_sub size (arg1, arg2, arg3) dbg =
bigstring_atomic_add size (arg1, arg2, neg_int arg3 dbg) dbg

(* Assumes unboxed float64 *)
let rec const_float_args n args name =
let rec const_args_gen ~extract ~type_name n args name =
match n, args with
| 0, [] -> []
| n, Cconst_float (f, _) :: args -> f :: const_float_args (n - 1) args name
| _ -> bad_immediate "Did not find constant float arguments for %s" name
| _, [] ->
bad_immediate "Missing %d constant %s argument(s) for %s" n type_name name
| n, arg :: args -> (
match extract arg with
| Some value ->
value :: const_args_gen ~extract ~type_name (n - 1) args name
| None ->
bad_immediate "Did not find constant %s arguments for %s" type_name name)

(* Assumes unboxed float32 *)
let const_float32_args =
const_args_gen
~extract:(function Cconst_float32 (f, _) -> Some f | _ -> None)
~type_name:"float32"

(* Assumes unboxed float64 *)
let const_float_args =
const_args_gen
~extract:(function Cconst_float (f, _) -> Some f | _ -> None)
~type_name:"float"

(* Assumes untagged int or unboxed int32, always representable by int63 *)
let rec const_int_args n args name =
match n, args with
| 0, [] -> []
| n, Cconst_int (i, _) :: args -> i :: const_int_args (n - 1) args name
| _ -> bad_immediate "Did not find constant int arguments for %s" name
let const_int_args =
const_args_gen
~extract:(function Cconst_int (i, _) -> Some i | _ -> None)
~type_name:"int"

(* Assumes unboxed int64: no tag, comes as Cconst_int when representable by
int63, otherwise we get Cconst_natint *)
let rec const_int64_args n args name =
match n, args with
| 0, [] -> []
| n, Cconst_int (i, _) :: args ->
Int64.of_int i :: const_int64_args (n - 1) args name
| n, Cconst_natint (i, _) :: args ->
Int64.of_nativeint i :: const_int64_args (n - 1) args name
| _ -> bad_immediate "Did not find constant int64 arguments for %s" name
let const_int64_args =
const_args_gen
~extract:(function
| Cconst_int (i, _) -> Some (Int64.of_int i)
| Cconst_natint (i, _) -> Some (Int64.of_nativeint i)
| _ -> None)
~type_name:"int64"

let int64_of_int8 i =
(* CR mslater: (SIMD) replace once we have unboxed int8 *)
Expand All @@ -238,7 +253,6 @@ let int64_of_int32 i =
Int64.of_int i |> Int64.logand 0xffffffffL

let int64_of_float32 f =
(* CR mslater: (SIMD) replace once we have unboxed float32 *)
Int32.bits_of_float f |> Int64.of_int32 |> Int64.logand 0xffffffffL

let pack_int32s i0 i1 = Int64.(logor (shift_left i1 32) i0)
Expand Down Expand Up @@ -272,12 +286,10 @@ let transl_vec128_builtin name args dbg _typ_res =
| "caml_float64x2_low_to_float" ->
let op = Cstatic_cast (Scalar_of_v128 Float64x2) in
if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
| "caml_float32x4_low_of_float" ->
(* CR mslater: (SIMD) replace once we have unboxed float32 *)
| "caml_float32x4_low_of_float32" ->
let op = Cstatic_cast (V128_of_scalar Float32x4) in
if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
| "caml_float32x4_low_to_float" ->
(* CR mslater: (SIMD) replace once we have unboxed float32 *)
| "caml_float32x4_low_to_float32" ->
let op = Cstatic_cast (Scalar_of_v128 Float32x4) in
if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
| "caml_int64x2_low_of_int64" ->
Expand Down Expand Up @@ -310,15 +322,13 @@ let transl_vec128_builtin name args dbg _typ_res =
if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
(* Constants *)
| "caml_float32x4_const1" ->
(* CR mslater: (SIMD) replace once we have unboxed float32 *)
let f = const_float_args 1 args name |> List.hd in
let f = const_float32_args 1 args name |> List.hd in
let i = int64_of_float32 f in
let i = pack_int32s i i in
Some (Cconst_vec128 ({ low = i; high = i }, dbg))
| "caml_float32x4_const4" ->
(* CR mslater: (SIMD) replace once we have unboxed float32 *)
let i0, i1, i2, i3 =
match const_float_args 4 args name |> List.map int64_of_float32 with
match const_float32_args 4 args name |> List.map int64_of_float32 with
| [i0; i1; i2; i3] -> i0, i1, i2, i3
| _ -> assert false
in
Expand Down
11 changes: 11 additions & 0 deletions backend/cmm_helpers.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1704,6 +1704,7 @@ let curry_function_sym function_kind arity result =
let bigarray_elt_size_in_bytes : Lambda.bigarray_kind -> int = function
| Pbigarray_unknown -> assert false
| Pbigarray_float32 -> 4
| Pbigarray_float32_t -> 4
| Pbigarray_float64 -> 8
| Pbigarray_sint8 -> 1
| Pbigarray_uint8 -> 1
Expand All @@ -1719,6 +1720,7 @@ let bigarray_elt_size_in_bytes : Lambda.bigarray_kind -> int = function
let bigarray_word_kind : Lambda.bigarray_kind -> memory_chunk = function
| Pbigarray_unknown -> assert false
| Pbigarray_float32 -> Single { reg = Float64 }
| Pbigarray_float32_t -> Single { reg = Float32 }
| Pbigarray_float64 -> Double
| Pbigarray_sint8 -> Byte_signed
| Pbigarray_uint8 -> Byte_unsigned
Expand Down Expand Up @@ -2173,6 +2175,15 @@ let unaligned_set_64 ptr idx newval dbg =
[add_int (add_int ptr idx dbg) (cconst_int 7) dbg; b8],
dbg ) ) ) )

let unaligned_load_f32 ptr idx dbg =
Cop (mk_load_mut (Single { reg = Float32 }), [add_int ptr idx dbg], dbg)

let unaligned_set_f32 ptr idx newval dbg =
Cop
( Cstore (Single { reg = Float32 }, Assignment),
[add_int ptr idx dbg; newval],
dbg )

let unaligned_load_128 ptr idx dbg =
assert (size_vec128 = 16);
Cop (mk_load_mut Onetwentyeight_unaligned, [add_int ptr idx dbg], dbg)
Expand Down
5 changes: 5 additions & 0 deletions backend/cmm_helpers.mli
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,11 @@ val unaligned_load_32 : expression -> expression -> Debuginfo.t -> expression
val unaligned_set_32 :
expression -> expression -> expression -> Debuginfo.t -> expression

val unaligned_load_f32 : expression -> expression -> Debuginfo.t -> expression

val unaligned_set_f32 :
expression -> expression -> expression -> Debuginfo.t -> expression

val unaligned_load_64 : expression -> expression -> Debuginfo.t -> expression

val unaligned_set_64 :
Expand Down
5 changes: 2 additions & 3 deletions backend/selectgen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,8 @@ let oper_result_type = function
| Cstatic_cast (Float32_of_float | Float_of_int Float32) -> typ_float32
| Cstatic_cast (Int_of_float (Float64 | Float32)) -> typ_int
| Cstatic_cast (V128_of_scalar _) -> typ_vec128
| Cstatic_cast (Scalar_of_v128 (Float64x2 | Float32x4)) ->
(* CR mslater: (SIMD) replace once we have unboxed float32 *)
typ_float
| Cstatic_cast (Scalar_of_v128 Float64x2) -> typ_float
| Cstatic_cast (Scalar_of_v128 Float32x4) -> typ_float32
| Cstatic_cast (Scalar_of_v128 (Int8x16 | Int16x8 | Int32x4 | Int64x2)) -> typ_int
| Craise _ -> typ_void
| Cprobe _ -> typ_void
Expand Down
2 changes: 1 addition & 1 deletion backend/x86_gas.ml
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ let print_instr b = function
| CQO -> i0 b "cqto"
| CVTSS2SI (arg1, arg2) -> i2 b "cvtss2si" arg1 arg2
| CVTSD2SI (arg1, arg2) -> i2 b "cvtsd2si" arg1 arg2
| CVTSI2SS (arg1, arg2) -> i2 b "cvtsi2ss" arg1 arg2
| CVTSI2SS (arg1, arg2) -> i2 b ("cvtsi2ss" ^ suf arg1) arg1 arg2
| CVTSD2SS (arg1, arg2) -> i2 b "cvtsd2ss" arg1 arg2
| CVTSI2SD (arg1, arg2) -> i2 b ("cvtsi2sd" ^ suf arg1) arg1 arg2
| CVTSS2SD (arg1, arg2) -> i2 b "cvtss2sd" arg1 arg2
Expand Down
22 changes: 12 additions & 10 deletions middle_end/flambda2/from_lambda/closure_conversion.ml
Original file line number Diff line number Diff line change
Expand Up @@ -856,16 +856,18 @@ let close_primitive acc env ~let_bound_ids_with_kinds named
| Pandbint _ | Porbint _ | Pxorbint _ | Plslbint _ | Plsrbint _
| Pasrbint _ | Pbintcomp _ | Punboxed_int_comp _ | Pbigarrayref _
| Pbigarrayset _ | Pbigarraydim _ | Pstring_load_16 _ | Pstring_load_32 _
| Pstring_load_64 _ | Pstring_load_128 _ | Pbytes_load_16 _
| Pbytes_load_32 _ | Pbytes_load_64 _ | Pbytes_load_128 _
| Pbytes_set_16 _ | Pbytes_set_32 _ | Pbytes_set_64 _ | Pbytes_set_128 _
| Pbigstring_load_16 _ | Pbigstring_load_32 _ | Pbigstring_load_64 _
| Pbigstring_load_128 _ | Pbigstring_set_16 _ | Pbigstring_set_32 _
| Pbigstring_set_64 _ | Pbigstring_set_128 _ | Pfloatarray_load_128 _
| Pfloat_array_load_128 _ | Pint_array_load_128 _
| Punboxed_float_array_load_128 _ | Punboxed_int32_array_load_128 _
| Punboxed_int64_array_load_128 _ | Punboxed_nativeint_array_load_128 _
| Pfloatarray_set_128 _ | Pfloat_array_set_128 _ | Pint_array_set_128 _
| Pstring_load_f32 _ | Pstring_load_64 _ | Pstring_load_128 _
| Pbytes_load_16 _ | Pbytes_load_32 _ | Pbytes_load_f32 _
| Pbytes_load_64 _ | Pbytes_load_128 _ | Pbytes_set_16 _ | Pbytes_set_32 _
| Pbytes_set_f32 _ | Pbytes_set_64 _ | Pbytes_set_128 _
| Pbigstring_load_16 _ | Pbigstring_load_32 _ | Pbigstring_load_f32 _
| Pbigstring_load_64 _ | Pbigstring_load_128 _ | Pbigstring_set_16 _
| Pbigstring_set_32 _ | Pbigstring_set_f32 _ | Pbigstring_set_64 _
| Pbigstring_set_128 _ | Pfloatarray_load_128 _ | Pfloat_array_load_128 _
| Pint_array_load_128 _ | Punboxed_float_array_load_128 _
| Punboxed_int32_array_load_128 _ | Punboxed_int64_array_load_128 _
| Punboxed_nativeint_array_load_128 _ | Pfloatarray_set_128 _
| Pfloat_array_set_128 _ | Pint_array_set_128 _
| Punboxed_float_array_set_128 _ | Punboxed_int32_array_set_128 _
| Punboxed_int64_array_set_128 _ | Punboxed_nativeint_array_set_128 _
| Pctconst _ | Pbswap16 | Pbbswap _ | Pint_as_pointer _ | Popaque _
Expand Down
Loading
Loading