ocaml-flambda · TheNumbat · May 9, 2024 · Mar 14, 2024 · Mar 14, 2024 · Mar 14, 2024
diff --git a/backend/amd64/arch.ml b/backend/amd64/arch.ml
@@ -139,14 +139,18 @@ type prefetch_info = {
 
 type bswap_bitwidth = Sixteen | Thirtytwo | Sixtyfour
 
+type float_width = Cmm.float_width
+
 type specific_operation =
-    Ilea of addressing_mode             (* "lea" gives scaled adds *)
+    Ilea of addressing_mode            (* "lea" gives scaled adds *)
   | Istore_int of nativeint * addressing_mode * bool
-                                        (* Store an integer constant *)
-  | Ioffset_loc of int * addressing_mode (* Add a constant to a location *)
-  | Ifloatarithmem of float_operation * addressing_mode
+                                       (* Store an integer constant *)
+  | Ioffset_loc of int * addressing_mode
+                                       (* Add a constant to a location *)
+  | Ifloatarithmem of float_width * float_operation * addressing_mode
                                        (* Float arith operation with memory *)
-  | Ifloatsqrtf of addressing_mode     (* Float square root from memory *)
+  | Ifloatsqrtf of float_width * addressing_mode
+                                       (* Float square root from memory *)
   | Ibswap of { bitwidth: bswap_bitwidth; } (* endianness conversion *)
   | Isextend32                         (* 32 to 64 bit conversion with sign
                                           extension *)
@@ -166,7 +170,10 @@ type specific_operation =
       }
 
 and float_operation =
-    Ifloatadd | Ifloatsub | Ifloatmul | Ifloatdiv
+  | Ifloatadd
+  | Ifloatsub
+  | Ifloatmul
+  | Ifloatdiv
 
 (* Sizes, endianness *)
 
@@ -244,16 +251,23 @@ let print_specific_operation printreg op ppf arg =
          (if is_assign then "(assign)" else "(init)")
   | Ioffset_loc(n, addr) ->
       fprintf ppf "[%a] +:= %i" (print_addressing printreg addr) arg n
-  | Ifloatsqrtf addr ->
+  | Ifloatsqrtf (Float64, addr) ->
      fprintf ppf "sqrtf float64[%a]"
              (print_addressing printreg addr) [|arg.(0)|]
-  | Ifloatarithmem(op, addr) ->
-      let op_name = function
-      | Ifloatadd -> "+f"
-      | Ifloatsub -> "-f"
-      | Ifloatmul -> "*f"
-      | Ifloatdiv -> "/f" in
-      fprintf ppf "%a %s float64[%a]" printreg arg.(0) (op_name op)
+  | Ifloatsqrtf (Float32, addr) ->
+     fprintf ppf "sqrtf float32[%a]"
+             (print_addressing printreg addr) [|arg.(0)|]
+  | Ifloatarithmem(width, op, addr) ->
+      let op_name, mem_ty = match width, op with  (* symbol, operand type *)
+      | Float64, Ifloatadd -> "+f", "float64"
+      | Float64, Ifloatsub -> "-f", "float64"
+      | Float64, Ifloatmul -> "*f", "float64"
+      | Float64, Ifloatdiv -> "/f", "float64"
+      | Float32, Ifloatadd -> "+f32", "float32"
+      | Float32, Ifloatsub -> "-f32", "float32"
+      | Float32, Ifloatmul -> "*f32", "float32"
+      | Float32, Ifloatdiv -> "/f32", "float32" in
+      fprintf ppf "%a %s %s[%a]" printreg arg.(0) op_name mem_ty
                    (print_addressing printreg addr)
                    (Array.sub arg 1 (Array.length arg - 1))
   | Ibswap { bitwidth } ->
@@ -360,9 +374,9 @@ let equal_prefetch_temporal_locality_hint left right =
 
 let equal_float_operation left right =
   match left, right with
-  | Ifloatadd, Ifloatadd -> true
-  | Ifloatsub, Ifloatsub -> true
-  | Ifloatmul, Ifloatmul -> true
+  | Ifloatadd, Ifloatadd  (* or-pattern: same constructor on both sides *)
+  | Ifloatsub, Ifloatsub  (* ... shares the [true] result below *)
+  | Ifloatmul, Ifloatmul
   | Ifloatdiv, Ifloatdiv -> true
   | (Ifloatadd | Ifloatsub | Ifloatmul | Ifloatdiv), _ -> false
 
@@ -373,11 +387,14 @@ let equal_specific_operation left right =
     Nativeint.equal x y && equal_addressing_mode x' y' && Bool.equal x'' y''
   | Ioffset_loc (x, x'), Ioffset_loc (y, y') ->
     Int.equal x y && equal_addressing_mode x' y'
-  | Ifloatarithmem (x, x'), Ifloatarithmem (y, y') ->
-    equal_float_operation x y && equal_addressing_mode x' y'
+  | Ifloatarithmem (xw, x, x'), Ifloatarithmem (yw, y, y') ->
+    Cmm.equal_float_width xw yw &&
+    equal_float_operation x y &&
+    equal_addressing_mode x' y'
   | Ibswap { bitwidth = left }, Ibswap { bitwidth = right } ->
     Int.equal (int_of_bswap_bitwidth left) (int_of_bswap_bitwidth right)
-  | Ifloatsqrtf left, Ifloatsqrtf right ->
+  | Ifloatsqrtf (left_w, left), Ifloatsqrtf (right_w, right) ->
+    Cmm.equal_float_width left_w right_w &&
     equal_addressing_mode left right
   | Isextend32, Isextend32 ->
     true

diff --git a/backend/amd64/arch.mli b/backend/amd64/arch.mli
@@ -68,14 +68,17 @@ type prefetch_info = {
 
 type bswap_bitwidth = Sixteen | Thirtytwo | Sixtyfour
 
+type float_width = Cmm.float_width
+
 type specific_operation =
     Ilea of addressing_mode             (* "lea" gives scaled adds *)
   | Istore_int of nativeint * addressing_mode * bool
                                         (* Store an integer constant *)
   | Ioffset_loc of int * addressing_mode (* Add a constant to a location *)
-  | Ifloatarithmem of float_operation * addressing_mode
+  | Ifloatarithmem of float_width * float_operation * addressing_mode
                                        (* Float arith operation with memory *)
-  | Ifloatsqrtf of addressing_mode     (* Float square root from memory *)
+  | Ifloatsqrtf of float_width * addressing_mode
+                                       (* Float square root from memory *)
   | Ibswap of { bitwidth: bswap_bitwidth; } (* endianness conversion *)
   | Isextend32                         (* 32 to 64 bit conversion with sign
                                           extension *)
@@ -95,7 +98,10 @@ type specific_operation =
       }
 
 and float_operation =
-    Ifloatadd | Ifloatsub | Ifloatmul | Ifloatdiv
+  | Ifloatadd
+  | Ifloatsub
+  | Ifloatmul
+  | Ifloatdiv
 
 val equal_specific_operation : specific_operation -> specific_operation -> bool
 

diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
@@ -570,18 +570,28 @@ let instr_for_intop = function
   | Iasr -> I.sar
   | _ -> assert false
 
-let instr_for_floatop = function
-  | Iaddf -> I.addsd
-  | Isubf -> I.subsd
-  | Imulf -> I.mulsd
-  | Idivf -> I.divsd
+let instr_for_floatop width op =
+  match op, width with  (* pick the scalar SSE insn for this op/width *)
+  | Iaddf, Float64 -> I.addsd
+  | Iaddf, Float32 -> I.addss
+  | Isubf, Float64 -> I.subsd
+  | Isubf, Float32 -> I.subss
+  | Imulf, Float64 -> I.mulsd
+  | Imulf, Float32 -> I.mulss
+  | Idivf, Float64 -> I.divsd
+  | Idivf, Float32 -> I.divss
   | _ -> assert false
 
-let instr_for_floatarithmem = function
-  | Ifloatadd -> I.addsd
-  | Ifloatsub -> I.subsd
-  | Ifloatmul -> I.mulsd
-  | Ifloatdiv -> I.divsd
+let instr_for_floatarithmem width op =
+  match op, width with  (* memory-operand form: sd = 64-bit, ss = 32-bit *)
+  | Ifloatadd, Float64 -> I.addsd
+  | Ifloatadd, Float32 -> I.addss
+  | Ifloatsub, Float64 -> I.subsd
+  | Ifloatsub, Float32 -> I.subss
+  | Ifloatmul, Float64 -> I.mulsd
+  | Ifloatmul, Float32 -> I.mulss
+  | Ifloatdiv, Float64 -> I.divsd
+  | Ifloatdiv, Float32 -> I.divss
 
 let cond = function
   | Isigned Ceq   -> E   | Isigned Cne   -> NE
@@ -600,7 +610,8 @@ let output_test_zero arg =
 
 (* Output a floating-point compare and branch *)
 
-let emit_float_test cmp i ~(taken:X86_ast.condition -> unit) =
+let emit_float_test (width : Cmm.float_width)
+                    cmp i ~(taken:X86_ast.condition -> unit) =
   (* Effect of comisd on flags and conditional branches:
                      ZF PF CF  cond. branches taken
         unordered     1  1  1  je, jb, jbe, jp
@@ -610,46 +621,51 @@ let emit_float_test cmp i ~(taken:X86_ast.condition -> unit) =
      If FP traps are on (they are off by default),
      comisd traps on QNaN and SNaN but ucomisd traps on SNaN only.
   *)
+  let ucomi, comi =
+    match width with
+    | Float64 -> I.ucomisd, I.comisd
+    | Float32 -> I.ucomiss, I.comiss
+  in
   match cmp with
   | CFeq when arg i 1 = arg i 0 ->
-      I.ucomisd (arg i 1) (arg i 0);
+      ucomi (arg i 1) (arg i 0);
       taken NP
   | CFeq ->
       let next = new_label() in
-      I.ucomisd (arg i 1) (arg i 0);
+      ucomi (arg i 1) (arg i 0);
       I.jp (label next);           (* skip if unordered *)
       taken E;                     (* branch taken if x=y *)
       def_label next
   | CFneq when arg i 1 = arg i 0 ->
-      I.ucomisd (arg i 1) (arg i 0);
+      ucomi (arg i 1) (arg i 0);
       taken P
   | CFneq ->
-      I.ucomisd (arg i 1) (arg i 0);
+      ucomi (arg i 1) (arg i 0);
       taken P;                     (* branch taken if unordered *)
       taken NE                     (* branch taken if x<y or x>y *)
   | CFlt ->
-      I.comisd (arg i 0) (arg i 1);
+      comi (arg i 0) (arg i 1);
       taken A                      (* branch taken if y>x i.e. x<y *)
   | CFnlt ->
-      I.comisd (arg i 0) (arg i 1);
+      comi (arg i 0) (arg i 1);
       taken BE                     (* taken if unordered or y<=x i.e. !(x<y) *)
   | CFle ->
-      I.comisd (arg i 0) (arg i 1);(* swap compare *)
+      comi (arg i 0) (arg i 1);    (* swap compare *)
       taken AE                     (* branch taken if y>=x i.e. x<=y *)
   | CFnle ->
-      I.comisd (arg i 0) (arg i 1);(* swap compare *)
+      comi (arg i 0) (arg i 1);    (* swap compare *)
       taken B                      (* taken if unordered or y<x i.e. !(x<=y) *)
   | CFgt ->
-      I.comisd (arg i 1) (arg i 0);
+      comi (arg i 1) (arg i 0);
       taken A                      (* branch taken if x>y *)
   | CFngt ->
-      I.comisd (arg i 1) (arg i 0);
+      comi (arg i 1) (arg i 0);
       taken BE                     (* taken if unordered or x<=y i.e. !(x>y) *)
   | CFge ->
-      I.comisd (arg i 1) (arg i 0);(* swap compare *)
+      comi (arg i 1) (arg i 0);    (* swap compare *)
       taken AE                     (* branch taken if x>=y *)
   | CFnge ->
-      I.comisd (arg i 1) (arg i 0);(* swap compare *)
+      comi (arg i 1) (arg i 0);    (* swap compare *)
       taken B                      (* taken if unordered or x<y i.e. !(x>=y) *)
 
 let emit_test i ~(taken:X86_ast.condition -> unit) = function
@@ -669,8 +685,8 @@ let emit_test i ~(taken:X86_ast.condition -> unit) = function
   | Iinttest_imm(cmp, n) ->
     I.cmp (int n) (arg i 0);
     taken (cond cmp)
-  | Ifloattest cmp ->
-    emit_float_test cmp i ~taken
+  | Ifloattest (width, cmp) ->
+    emit_float_test width cmp i ~taken
   | Ioddtest ->
     I.test (int 1) (arg8 i 0);
     taken NE
@@ -1513,18 +1529,31 @@ let emit_instr ~first ~fallthrough i =
       instr_for_intop op (int n) (res i 0)
   | Lop(Iintop_atomic{op; size; addr}) ->
       emit_atomic i op size addr
-  | Lop(Ifloatop(Icompf cmp)) ->
+  | Lop(Ifloatop(Float64, Icompf cmp)) ->
       let cond, need_swap = float_cond_and_need_swap cmp in
       let a0, a1 = if need_swap then arg i 1, arg i 0 else arg i 0, arg i 1 in
       I.cmpsd cond a1 a0;
       I.movq a0 (res i 0);
       I.neg (res i 0)
-  | Lop(Ifloatop(Inegf)) ->
+  | Lop(Ifloatop(Float32, Icompf cmp)) ->
+      let cond, need_swap = float_cond_and_need_swap cmp in
+      let a0, a1 = if need_swap then arg i 1, arg i 0 else arg i 0, arg i 1 in
+      I.cmpss cond a1 a0;
+      I.movd a0 (res32 i 0);
+      (* CMPSS only sets the bottom 32 bits of the result, so we sign-extend to
+         copy the result to the top 32 bits. *)
+      I.movsxd (res32 i 0) (res i 0);
+      I.neg (res i 0)
+  | Lop(Ifloatop(Float64, Inegf)) ->
       I.xorpd (mem64_rip VEC128 (emit_symbol "caml_negf_mask")) (res i 0)
-  | Lop(Ifloatop(Iabsf)) ->
+  | Lop(Ifloatop(Float64, Iabsf)) ->
       I.andpd (mem64_rip VEC128 (emit_symbol "caml_absf_mask")) (res i 0)
-  | Lop(Ifloatop(Iaddf | Isubf | Imulf | Idivf as floatop)) ->
-      instr_for_floatop floatop (arg i 1) (res i 0)
+  | Lop(Ifloatop(Float32, Inegf)) ->
+      I.xorps (mem64_rip VEC128 (emit_symbol "caml_negf32_mask")) (res i 0)
+  | Lop(Ifloatop(Float32, Iabsf)) ->
+      I.andps (mem64_rip VEC128 (emit_symbol "caml_absf32_mask")) (res i 0)
+  | Lop(Ifloatop(width, (Iaddf | Isubf | Imulf | Idivf as floatop))) ->
+      instr_for_floatop width floatop (arg i 1) (res i 0)
   | Lop(Iintofvalue | Ivalueofint | Ivectorcast Bits128) ->
       move i.arg.(0) i.res.(0)
   | Lop(Iscalarcast (Float_of_int Float64)) ->
@@ -1579,18 +1608,23 @@ let emit_instr ~first ~fallthrough i =
       I.mov (nat n) (addressing addr QWORD i 0)
   | Lop(Ispecific(Ioffset_loc(n, addr))) ->
       I.add (int n) (addressing addr QWORD i 0)
-  | Lop(Ispecific(Ifloatarithmem(op, addr))) ->
-      instr_for_floatarithmem op (addressing addr REAL8 i 1) (res i 0)
+  | Lop(Ispecific(Ifloatarithmem(Float64, op, addr))) ->
+      instr_for_floatarithmem Float64 op (addressing addr REAL8 i 1) (res i 0)
+  | Lop(Ispecific(Ifloatarithmem(Float32, op, addr))) ->
+      instr_for_floatarithmem Float32 op (addressing addr REAL4 i 1) (res i 0)
   | Lop(Ispecific(Ibswap { bitwidth = Sixteen })) ->
       I.xchg ah al;
       I.movzx (res16 i 0) (res i 0)
   | Lop(Ispecific(Ibswap { bitwidth = Thirtytwo })) ->
       I.bswap (res32 i 0);
   | Lop(Ispecific(Ibswap { bitwidth = Sixtyfour })) ->
       I.bswap (res i 0)
-  | Lop(Ispecific(Ifloatsqrtf addr)) ->
+  | Lop(Ispecific(Ifloatsqrtf (Float64, addr))) ->
       I.xorpd (res i 0) (res i 0); (* avoid partial register stall *)
       I.sqrtsd (addressing addr REAL8 i 0) (res i 0)
+  | Lop(Ispecific(Ifloatsqrtf (Float32, _addr))) ->
+      (* CR mslater: (float32) Ifloatsqrtf Float32 *)
+      Misc.fatal_error "Ifloatsqrtf Float32 should never be generated."
   | Lop(Ispecific(Isextend32)) ->
       I.movsxd (arg32 i 0) (res i 0)
   | Lop(Ispecific(Izextend32)) ->
@@ -2012,6 +2046,13 @@ let begin_assembly unix =
     _label (emit_symbol "caml_absf_mask");
     D.qword (Const 0x7FFFFFFFFFFFFFFFL);
     D.qword (Const 0xFFFFFFFFFFFFFFFFL);
+    _label (emit_symbol "caml_negf32_mask");
+    D.qword (Const 0x80000000L);
+    D.qword (Const 0L);
+    D.align ~data:true 16;
+    _label (emit_symbol "caml_absf32_mask");
+    D.qword (Const 0xFFFFFFFF7FFFFFFFL);
+    D.qword (Const 0xFFFFFFFFFFFFFFFFL);
   end;
 
   D.data ();

diff --git a/backend/amd64/proc.ml b/backend/amd64/proc.ml
@@ -488,7 +488,7 @@ let destroyed_at_oper = function
   | Iop(Ispecific(Isextend32 | Izextend32 | Ilea _
                  | Istore_int (_, _, _) | Ioffset_loc (_, _)
                  | Ipause | Iprefetch _
-                 | Ifloatarithmem (_, _) | Ifloatsqrtf _ | Ibswap _))
+                 | Ifloatarithmem (_, _, _) | Ifloatsqrtf (_, _) | Ibswap _))
   | Iop(Iintop(Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
               | Ipopcnt | Iclz _ | Ictz _ ))
   | Iop(Iintop_imm((Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl
@@ -662,7 +662,8 @@ let max_register_pressure =
     consumes ~int:(1 + num_destroyed_by_plt_stub) ~float:0
   | Iintop(Icomp _) | Iintop_imm((Icomp _), _) ->
     consumes ~int:1 ~float:0
-  | Istore(Single { reg = Float64 }, _, _) | Ifloatop (Icompf _) ->
+  | Istore(Single { reg = Float64 }, _, _)
+  | Ifloatop ((Float64 | Float32), Icompf _) ->
     consumes ~int:0 ~float:1
   | Ispecific(Isimd op) ->
     (match Simd_proc.register_behavior op with
@@ -688,7 +689,8 @@ let max_register_pressure =
             | Single { reg = Float32 } | Double
             | Onetwentyeight_aligned | Onetwentyeight_unaligned),
             _, _)
-  | Imove | Ispill | Ireload | Ifloatop (Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf)
+  | Imove | Ispill | Ireload
+  | Ifloatop ((Float64 | Float32), (Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf))
   | Icsel _
   | Ivalueofint | Iintofvalue | Ivectorcast _ | Iscalarcast _
   | Iconst_int _ | Iconst_float _ | Iconst_float32 _
@@ -698,7 +700,8 @@ let max_register_pressure =
   | Ispecific(Ilea _ | Isextend32 | Izextend32 | Iprefetch _ | Ipause
              | Irdtsc | Irdpmc | Istore_int (_, _, _)
              | Ilfence | Isfence | Imfence
-             | Ioffset_loc (_, _) | Ifloatarithmem (_, _) | Ifloatsqrtf _
+             | Ioffset_loc (_, _) | Ifloatarithmem (_, _, _)
+             | Ifloatsqrtf (_, _)
              | Ibswap _)
   | Iname_for_debugger _ | Iprobe _ | Iprobe_is_enabled _ | Iopaque
   | Ibeginregion | Iendregion | Idls_get
@@ -794,7 +797,7 @@ let operation_supported = function
   | Cbswap _
   | Cclz _ | Cctz _
   | Ccmpi _ | Caddv | Cadda | Ccmpa _
-  | Cnegf | Cabsf | Caddf | Csubf | Cmulf | Cdivf
+  | Cnegf _ | Cabsf _ | Caddf _ | Csubf _ | Cmulf _ | Cdivf _
   | Cvalueofint | Cintofvalue
   | Ccmpf _
   | Craise _