ocaml-flambda · mshinwell · May 13, 2021 · Apr 28, 2021 · Apr 28, 2021 · Apr 28, 2021
diff --git a/backend/amd64/CSE.ml b/backend/amd64/CSE.ml
@@ -32,6 +32,8 @@ method! class_of_operation op =
     | Ioffset_loc(_, _) -> Op_store true
     | Ifloatarithmem _ | Ifloatsqrtf _ -> Op_load
     | Ibswap _ | Isqrtf -> super#class_of_operation op
+    | Irdtsc | Irdpmc -> Op_other
+    | Icrc32q -> Op_pure
     end
   | _ -> super#class_of_operation op
 

diff --git a/backend/amd64/arch.ml b/backend/amd64/arch.ml
@@ -16,6 +16,9 @@
 (* POPCNT instruction is not available prior to Nehalem, released in 2008. *)
 let popcnt_support = ref true
 
+(* CRC32 requires SSE 4.2 support *)
+let crc32_support = ref true
+
 (* Machine-specific command-line options *)
 
 let command_line_options =
@@ -27,6 +30,10 @@ let command_line_options =
       " Use POPCNT instruction (not available prior to Nehalem)";
     "-fno-popcnt", Arg.Clear popcnt_support,
       " Do not use POPCNT instruction";
+    "-fcrc32", Arg.Set crc32_support,
+      " Use CRC32 instructions (requires SSE4.2 support)";
+    "-fno-crc32", Arg.Clear crc32_support,
+      " Do not emit CRC32 instructions";
   ]
 
 (* Specific operations for the AMD64 processor *)
@@ -54,6 +61,9 @@ type specific_operation =
                                           extension *)
   | Izextend32                         (* 32 to 64 bit conversion with zero
                                           extension *)
+  | Irdtsc                             (* read timestamp *)
+  | Irdpmc                             (* read performance counter *)
+  | Icrc32q                            (* compute crc *)
 
 and float_operation =
     Ifloatadd | Ifloatsub | Ifloatmul | Ifloatdiv
@@ -143,6 +153,12 @@ let print_specific_operation printreg op ppf arg =
       fprintf ppf "sextend32 %a" printreg arg.(0)
   | Izextend32 ->
       fprintf ppf "zextend32 %a" printreg arg.(0)
+  | Irdtsc ->
+      fprintf ppf "rdtsc"
+  | Irdpmc ->
+      fprintf ppf "rdpmc %a" printreg arg.(0)
+  | Icrc32q ->
+      fprintf ppf "crc32 %a %a" printreg arg.(0) printreg arg.(1)
 
 let win64 =
   match Config.system with

diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
@@ -958,6 +958,26 @@ let emit_instr fallthrough i =
   | Lop(Iintop Ipopcnt) ->
       assert (!popcnt_support);
       I.popcnt (arg i 0) (res i 0)
+  | Lop(Ispecific Irdtsc) ->
+    assert (reg64 i.res.(0) = RDX);
+    I.rdtsc ();
+    (* The instruction fills in the low 32 bits of the result registers. *)
+    (* Combine edx and eax into a single 64-bit result in rdx. *)
+    I.sal (int 32) (res i 0); (* shift edx to the high part of rdx *)
+    (* On processors that support the Intel 64 architecture,
+       the high-order 32 bits of each of RAX and RDX are cleared. *)
+    I.or_ rax (res i 0) (* combine high and low into rdx *)
+  | Lop(Ispecific Irdpmc) ->
+    assert ((arg64 i 0 = RCX) && (reg64 i.res.(0) = RDX));
+    I.rdpmc ();
+    (* The instruction fills in the low 32 bits of the result registers. *)
+    (* Combine edx and eax into a single 64-bit result in rdx. *)
+    I.sal (int 32) (res i 0); (* shift edx to the high part of rdx *)
+    I.mov eax eax; (* zero-extend eax *)
+    I.or_ rax (res i 0) (* combine high and low into rdx *)
+  | Lop (Ispecific Icrc32q) ->
+    assert (arg i 0 = res i 0);
+    I.crc32 (arg i 1) (res i 0)
   | Lop (Iname_for_debugger _) -> ()
   | Lop (Iprobe _) ->
     let probe_label = new_label () in

diff --git a/backend/amd64/proc.ml b/backend/amd64/proc.ml
@@ -332,6 +332,7 @@ let destroyed_at_oper = function
       [| loc_spacetime_node_hole |]
   | Iswitch(_, _) -> [| rax; rdx |]
   | Itrywith _ -> [| r11 |]
+  | Iop(Ispecific (Irdtsc | Irdpmc)) -> [| rax |]
   | _ ->
     if fp then
 (* prevent any use of the frame pointer ! *)

diff --git a/backend/amd64/reload.ml b/backend/amd64/reload.ml
@@ -46,6 +46,9 @@ open Mach
      Iintoffloat                R       S
      Ispecific(Ilea)            R       R       R
      Ispecific(Ifloatarithmem)  R       R       R
+     Ispecific(Icrc32q)         R       R       S   (and Res = Arg1)
+     Ispecific(Irdtsc)          R                   (and Res = rdx)
+     Ispecific(Irdpmc)          R       R           (and Res = rdx, Arg1 = rcx)
 
    Conditional branches:
      Iinttest                           S       R
@@ -86,6 +89,16 @@ method! reload_operation op arg res =
       if stackp arg.(0)
       then (let r = self#makereg arg.(0) in ([|r; arg.(1)|], [|r|]))
       else (arg, res)
+  | Ispecific (Irdtsc | Irdpmc) ->
+    (* Irdtsc: res(0) already forced in reg.
+       Irdpmc: res(0) and arg(0) already forced in regs. *)
+      (arg, res)
+  | Ispecific Icrc32q ->
+    (* First argument and result must be in the same register.
+       Second argument can be either in a register or on stack. *)
+      if stackp arg.(0)
+      then (let r = self#makereg arg.(0) in ([|r; arg.(1)|], [|r|]))
+      else (arg, res)
   | Ifloatofint | Iintoffloat ->
       (* Result must be in register, but argument can be on stack *)
       (arg, (if stackp res.(0) then [| self#makereg res.(0) |] else res))
@@ -97,7 +110,15 @@ method! reload_operation op arg res =
       if !Clflags.pic_code || !Clflags.dlcode || Arch.win64
       then super#reload_operation op arg res
       else (arg, res)
-  | _ -> (* Other operations: all args and results in registers *)
+  | Iintop (Ipopcnt | Iclz _| Ictz _)
+  | Ispecific  (Isqrtf | Isextend32 | Izextend32 | Ilea _
+               | Istore_int (_, _, _)
+               | Ioffset_loc (_, _) | Ifloatarithmem (_, _)
+               | Ibswap _| Ifloatsqrtf _)
+  | Imove|Ispill|Ireload|Inegf|Iabsf|Iconst_float _|Icall_ind _|Icall_imm _
+  | Itailcall_ind _|Itailcall_imm _|Iextcall _|Istackoffset _|Iload (_, _)
+  | Istore (_, _, _)|Ialloc _|Iname_for_debugger _|Iprobe _|Iprobe_is_enabled _
+    -> (* Other operations: all args and results in registers *)
       super#reload_operation op arg res
 
 method! reload_test tst arg =

diff --git a/backend/amd64/selection.ml b/backend/amd64/selection.ml
@@ -96,6 +96,7 @@ let pseudoregs_for_operation op arg res =
       ([| rax |], [| rax |])
   (* For imulq, first arg must be in rax, rax is clobbered, and result is in
      rdx. *)
+  | Ispecific (Ibswap _) -> assert false
   | Iintop(Imulh) ->
       ([| rax; arg.(1) |], [| rdx |])
   | Ispecific(Ifloatarithmem(_,_)) ->
@@ -112,8 +113,30 @@ let pseudoregs_for_operation op arg res =
       ([| rax; rcx |], [| rax |])
   | Iintop(Imod) ->
       ([| rax; rcx |], [| rdx |])
+  | Ispecific Irdtsc ->
+  (* For rdtsc instruction, the result is in edx (high) and eax (low).
+     Make it simple and force the result in rdx and rax clobbered. *)
+    ([| |], [| rdx |])
+  | Ispecific Irdpmc ->
+  (* For rdpmc instruction, the argument must be in ecx
+     and the result is in edx (high) and eax (low).
+     Make it simple and force the argument in rcx, the result in rdx,
+     and rax clobbered *)
+    ([| rcx |], [| rdx |])
+  | Ispecific Icrc32q ->
+    (* arg.(0) and res.(0) must be the same *)
+    ([|res.(0); arg.(1)|], res)
   (* Other instructions are regular *)
-  | _ -> raise Use_default
+  | Iintop (Ipopcnt|Iclz _|Ictz _|Icomp _|Icheckbound _)
+  | Iintop_imm ((Imulh|Idiv|Imod|Icomp _|Icheckbound _
+                |Ipopcnt|Iclz _|Ictz _), _)
+  | Ispecific (Isqrtf|Isextend32|Izextend32|Ilea _|Istore_int (_, _, _)
+              |Ioffset_loc (_, _)|Ifloatsqrtf _)
+  | Imove|Ispill|Ireload|Ifloatofint|Iintoffloat|Iconst_int _|Iconst_float _
+  | Iconst_symbol _|Icall_ind _|Icall_imm _|Itailcall_ind _|Itailcall_imm _
+  | Iextcall _|Istackoffset _|Iload (_, _)|Istore (_, _, _)|Ialloc _
+  | Iname_for_debugger _|Iprobe _|Iprobe_is_enabled _
+    -> raise Use_default
 
 (* If you update [inline_ops], you may need to update [is_simple_expr] and/or
    [effects_of], below. *)
@@ -210,7 +233,17 @@ method! select_operation op args dbg =
          (Ispecific Isqrtf, [arg])
      | _ ->
          assert false
-     end
+    end
+  | Cextcall { name; builtin = true; ret; label_after } ->
+      begin match name, ret with
+      | "caml_rdtsc_unboxed", [|Int|] -> Ispecific Irdtsc, args
+      | "caml_rdpmc_unboxed", [|Int|] -> Ispecific Irdpmc, args
+      | ("caml_int64_crc_unboxed", [|Int|]
+        | "caml_int_crc_untagged", [|Int|]) when !Arch.crc32_support ->
+          Ispecific Icrc32q, args
+      | _ ->
+        super#select_operation op args dbg
+      end
   (* Recognize store instructions *)
   | Cstore ((Word_int|Word_val as chunk), _init) ->
       begin match args with

diff --git a/backend/x86_ast.mli b/backend/x86_ast.mli
@@ -108,6 +108,7 @@ type instruction =
   | CMP of arg * arg
   | COMISD of arg * arg
   | CQO
+  | CRC32 of arg * arg
   | CVTSD2SI of arg * arg
   | CVTSD2SS of arg * arg
   | CVTSI2SD of arg * arg
@@ -172,6 +173,8 @@ type instruction =
   | POP of arg
   | POPCNT of arg * arg
   | PUSH of arg
+  | RDTSC
+  | RDPMC
   | RET
   | ROUNDSD of rounding * arg * arg
   | SAL of arg * arg

diff --git a/backend/x86_dsl.ml b/backend/x86_dsl.ml
@@ -122,6 +122,7 @@ module I = struct
   let cmp x y = emit (CMP (x, y))
   let comisd x y = emit (COMISD (x, y))
   let cqo () = emit CQO
+  let crc32 x y = emit (CRC32 (x, y))
   let cvtsd2ss x y = emit (CVTSD2SS (x, y))
   let cvtsi2sd x y = emit (CVTSI2SD (x, y))
   let cvtss2sd x y = emit (CVTSS2SD (x, y))
@@ -190,6 +191,8 @@ module I = struct
   let pop x = emit (POP x)
   let popcnt x y = emit (POPCNT (x, y))
   let push x = emit (PUSH x)
+  let rdtsc () = emit (RDTSC)
+  let rdpmc () = emit (RDPMC)
   let ret () = emit RET
   let sal x y = emit (SAL (x, y))
   let sar x y = emit (SAR (x, y))

diff --git a/backend/x86_dsl.mli b/backend/x86_dsl.mli
@@ -115,6 +115,7 @@ module I : sig
   val cmp: arg -> arg -> unit
   val comisd: arg -> arg -> unit
   val cqo: unit -> unit
+  val crc32 : arg -> arg -> unit
   val cvtsd2ss: arg -> arg -> unit
   val cvtsi2sd: arg -> arg -> unit
   val cvtss2sd: arg -> arg -> unit
@@ -183,6 +184,8 @@ module I : sig
   val pop: arg -> unit
   val popcnt : arg -> arg -> unit
   val push: arg -> unit
+  val rdtsc: unit -> unit
+  val rdpmc: unit -> unit
   val ret: unit -> unit
   val sal: arg -> arg -> unit
   val sar: arg -> arg -> unit

diff --git a/backend/x86_gas.ml b/backend/x86_gas.ml
@@ -131,6 +131,7 @@ let print_instr b = function
   | CMP (arg1, arg2) -> i2_s b "cmp" arg1 arg2
   | COMISD (arg1, arg2) -> i2 b "comisd" arg1 arg2
   | CQO ->  i0 b "cqto"
+  | CRC32 (arg1, arg2) -> i2_s b "crc32" arg1 arg2
   | CVTSD2SI (arg1, arg2) -> i2 b "cvtsd2si" arg1 arg2
   | CVTSD2SS (arg1, arg2) -> i2 b "cvtsd2ss" arg1 arg2
   | CVTSI2SD (arg1, arg2) -> i2 b ("cvtsi2sd" ^ suf arg1) arg1 arg2
@@ -207,6 +208,8 @@ let print_instr b = function
   | POP  arg -> i1_s b "pop" arg
   | POPCNT (arg1, arg2) -> i2_s b "popcnt" arg1 arg2
   | PUSH arg -> i1_s b "push" arg
+  | RDTSC -> i0 b "rdtsc"
+  | RDPMC -> i0 b "rdpmc"
   | RET ->  i0 b "ret"
   | ROUNDSD (r, arg1, arg2) -> i2 b (string_of_rounding r) arg1 arg2
   | SAL (arg1, arg2) -> i2_s b "sal" arg1 arg2

diff --git a/backend/x86_masm.ml b/backend/x86_masm.ml
@@ -128,6 +128,7 @@ let print_instr b = function
   | CMP (arg1, arg2) -> i2 b "cmp" arg1 arg2
   | COMISD (arg1, arg2) -> i2 b "comisd" arg1 arg2
   | CQO -> i0 b "cqo"
+  | CRC32 (arg1, arg2) -> i2 b "crc32q" arg1 arg2
   | CVTSD2SI (arg1, arg2) -> i2 b "cvtsd2si" arg1 arg2
   | CVTSD2SS (arg1, arg2) -> i2 b "cvtsd2ss" arg1 arg2
   | CVTSI2SD (arg1, arg2) -> i2 b "cvtsi2sd" arg1 arg2
@@ -199,6 +200,8 @@ let print_instr b = function
   | POP arg -> i1 b "pop" arg
   | POPCNT (arg1, arg2) -> i2 b "popcnt" arg1 arg2
   | PUSH arg -> i1 b "push" arg
+  | RDTSC  -> i0 b "rdtsc"
+  | RDPMC -> i0 b "rdpmc"
   | RET -> i0 b "ret"
   | ROUNDSD (r, arg1, arg2) -> i2 b (string_of_rounding r) arg1 arg2
   | SAL (arg1, arg2) -> i2 b "sal" arg1 arg2