Add intrinsics for rdtsc, rdpmc, crc32 (amd64)

ocaml-flambda · mshinwell · May 13, 2021 · Apr 28, 2021 · Apr 28, 2021 · Apr 28, 2021
commit 49fc923acf6b0f8b7c282567d70542091c3b0e13
diff --git a/backend/amd64/CSE.ml b/backend/amd64/CSE.ml
@@ -32,6 +32,8 @@ method! class_of_operation op =
     | Ioffset_loc(_, _) -> Op_store true
     | Ifloatarithmem _ | Ifloatsqrtf _ -> Op_load
     | Ibswap _ | Isqrtf -> super#class_of_operation op
+    | Irdtsc | Irdpmc -> Op_other
+    | Icrc32q -> Op_pure
     end
   | _ -> super#class_of_operation op
 

diff --git a/backend/amd64/arch.ml b/backend/amd64/arch.ml
@@ -16,6 +16,9 @@
 (* POPCNT instruction is not available prior to Nehalem, released in 2008. *)
 let popcnt_support = ref true
 
+(* CRC32 requires SSE 4.2 support *)
+let crc32_support = ref true
+
 (* Machine-specific command-line options *)
 
 let command_line_options =
@@ -27,6 +30,10 @@ let command_line_options =
       " Use POPCNT instruction (not available prior to Nehalem)";
     "-fno-popcnt", Arg.Clear popcnt_support,
       " Do not use POPCNT instruction";
+    "-fcrc32", Arg.Set crc32_support,
+      " Use CRC32 instructions (requires SSE4.2 support)";
+    "-fno-crc32", Arg.Clear crc32_support,
+      " Do not emit CRC32 instructions";
   ]
 
 (* Specific operations for the AMD64 processor *)
@@ -54,6 +61,9 @@ type specific_operation =
                                           extension *)
   | Izextend32                         (* 32 to 64 bit conversion with zero
                                           extension *)
+  | Irdtsc                             (* read timestamp *)
+  | Irdpmc                             (* read performance counter *)
+  | Icrc32q                            (* compute crc *)
 
 and float_operation =
     Ifloatadd | Ifloatsub | Ifloatmul | Ifloatdiv
@@ -143,6 +153,12 @@ let print_specific_operation printreg op ppf arg =
       fprintf ppf "sextend32 %a" printreg arg.(0)
   | Izextend32 ->
       fprintf ppf "zextend32 %a" printreg arg.(0)
+  | Irdtsc ->
+      fprintf ppf "rdtsc"
+  | Irdpmc ->
+      fprintf ppf "rdpmc %a" printreg arg.(0)
+  | Icrc32q ->
+      fprintf ppf "crc32 %a %a" printreg arg.(0) printreg arg.(1)
 
 let win64 =
   match Config.system with

diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
@@ -958,6 +958,41 @@ let emit_instr fallthrough i =
   | Lop(Iintop Ipopcnt) ->
       assert (!popcnt_support);
       I.popcnt (arg i 0) (res i 0)
+  | Lop(Ispecific Irdtsc) ->
+    (* XCR mshinwell: Instead of the above comment, let's write assertions to
+       ensure that the constraints have been satisfied.  Likewise for any other
+       intrinsic cases below with hard register constraints. *)
+    assert (reg64 i.res.(0) = RDX);
+    I.rdtsc ();
+    (* The instruction fills in the low 32 bits of the result registers. *)
+    (* Combine edx and eax into a single 64-bit result in rdx. *)
+    I.sal (int 32) (res i 0); (* shift edx to the high part of rdx *)
+    (* XCR mshinwell: Why is this sign extension needed?  (same below).
+
+       gyorsh: it looks wrong, thanks for catching it!
+       I think it was supposed to be zero extend.
+       For rdtsc it is not needed for this target
+       (see comment added below), even though gcc still generates it -
+       as a simple mov (not movzxd) on 32 bit registers, which zeros
+       the high bits of the corresponding 64-bit registers.
+       For rdpmc, it is still needed, I think, so I fixed it below
+       to movzx. Maybe "mov eax eax" would be better: faster and smaller code.
+    *)
+    (* On processors that support the Intel 64 architecture,
+       the high-order 32 bits of each of RAX and RDX are cleared. *)
+    I.or_ rax (res i 0) (* combine high and low into rdx *)
+  | Lop(Ispecific Irdpmc) ->
+    assert ((arg64 i 0 = RCX) && (reg64 i.res.(0) = RDX));
+    I.rdpmc ();
+    (* The instruction fills in the low 32 bits of the result registers. *)
+    (* Combine edx and eax into a single 64-bit result in rdx. *)
+    I.sal (int 32) (res i 0); (* shift edx to the high part of rdx *)
+    I.mov eax eax; (* zero-extend eax *)
+    I.or_ rax (res i 0) (* combine high and low into rdx *)
+  | Lop (Ispecific Icrc32q) ->
+    assert (arg i 0 = res i 0);
+    (* XCR mshinwell: write an assertion, not a comment! *)
+    I.crc32 (arg i 1) (res i 0)
   | Lop (Iname_for_debugger _) -> ()
   | Lop (Iprobe _) ->
     let probe_label = new_label () in

diff --git a/backend/amd64/proc.ml b/backend/amd64/proc.ml
@@ -332,6 +332,7 @@ let destroyed_at_oper = function
       [| loc_spacetime_node_hole |]
   | Iswitch(_, _) -> [| rax; rdx |]
   | Itrywith _ -> [| r11 |]
+  | Iop(Ispecific (Irdtsc | Irdpmc)) -> [| rax |]
   | _ ->
     if fp then
 (* prevent any use of the frame pointer ! *)

diff --git a/backend/amd64/reload.ml b/backend/amd64/reload.ml
@@ -46,6 +46,9 @@ open Mach
      Iintoffloat                R       S
      Ispecific(Ilea)            R       R       R
      Ispecific(Ifloatarithmem)  R       R       R
+     Ispecific(Icrc32q)         R       R       S   (and Res = Arg1)
+     Ispecific(Irdtsc)          R                   (and Res = rdx)
+     Ispecific(Irdpmc)          R       R           (and Res = rdx, Arg1 = rcx)
 
    Conditional branches:
      Iinttest                           S       R
@@ -86,6 +89,42 @@ method! reload_operation op arg res =
       if stackp arg.(0)
       then (let r = self#makereg arg.(0) in ([|r; arg.(1)|], [|r|]))
       else (arg, res)
+  (* CR mshinwell for mshinwell: re-read the next three cases once
+     comments addressed *)
+  | Ispecific (Irdtsc | Irdpmc) ->
+    (* XCR mshinwell: The table in the comment needs updating for this
+       operation and the next one below *)
+    (* XCR mshinwell: I don't think this [force] function is needed.  I'm not
+       clear on the exact details, but these cases are very similar to
+       Iintop (Idiv | Imod | Imulh) -- see selection.ml and the comment above
+       which says "already forced in regs". *)
+    (* XCR mshinwell: Same as previous case. *)
+    (* Irdtsc: res(0) already forced in reg.
+       Irdpmc: res(0) and arg(0) already forced in regs. *)
+      (arg, res)
+  | Ispecific Icrc32q ->
+    (* First argument and result must be in the same register.
+       Second argument can be either in a register or on stack. *)
+    (* XCR mshinwell: I think something is missing in selection.ml here.
+       Look for example at the Iintop Imulf case.  That has a case in
+       selection.ml to ensure that the register constraint is satisfied.
+       I suspect that if we add a similar case in [pseudoregs_for_operation]
+       for Icrc32q, then [first_arg_and_res_overlap] will always be [true]
+       here.  The following code should then be much more straightforward.
+       I think the way to think about the distinction between selection.ml
+       and this file is that the former should establish any necessary
+       hard register or equality constraints; and this file only needs to deal
+       with fixing things up if an operand or result that has to be in
+       a register has thus far been assigned to the stack.
+       (In the case of selection.ml establishing a hard register constraint,
+       the allocator presumably never changes that, which is why e.g. in the
+       two cases above for the timestamp counters we shouldn't need any kind
+       of forcing function here.)
+
+      gretay: fixed, thank you for explaining it!*)
+      if stackp arg.(0)
+      then (let r = self#makereg arg.(0) in ([|r; arg.(1)|], [|r|]))
+      else (arg, res)
   | Ifloatofint | Iintoffloat ->
       (* Result must be in register, but argument can be on stack *)
       (arg, (if stackp res.(0) then [| self#makereg res.(0) |] else res))
@@ -97,7 +136,22 @@ method! reload_operation op arg res =
       if !Clflags.pic_code || !Clflags.dlcode || Arch.win64
       then super#reload_operation op arg res
       else (arg, res)
-  | _ -> (* Other operations: all args and results in registers *)
+  (* XCR mshinwell: Since we're here, let's please turn this into an
+     exhaustive match.  (Maybe you could make a separate patch to do that.)
+     This will increase confidence that we haven't missed any cases as we
+     add intrinsics.  Please also do the same for pseudoregs_for_operation in
+     selection.ml.
+
+     gyorsh: done. do you mean as a separate patch upstream?  *)
+  | Iintop (Ipopcnt | Iclz _| Ictz _)
+  | Ispecific  (Isqrtf | Isextend32 | Izextend32 | Ilea _
+               | Istore_int (_, _, _)
+               | Ioffset_loc (_, _) | Ifloatarithmem (_, _)
+               | Ibswap _| Ifloatsqrtf _)
+  | Imove|Ispill|Ireload|Inegf|Iabsf|Iconst_float _|Icall_ind _|Icall_imm _
+  | Itailcall_ind _|Itailcall_imm _|Iextcall _|Istackoffset _|Iload (_, _)
+  | Istore (_, _, _)|Ialloc _|Iname_for_debugger _|Iprobe _|Iprobe_is_enabled _
+    -> (* Other operations: all args and results in registers *)
       super#reload_operation op arg res
 
 method! reload_test tst arg =

diff --git a/backend/amd64/selection.ml b/backend/amd64/selection.ml
@@ -96,6 +96,7 @@ let pseudoregs_for_operation op arg res =
       ([| rax |], [| rax |])
   (* For imulq, first arg must be in rax, rax is clobbered, and result is in
      rdx. *)
+  | Ispecific (Ibswap _) -> assert false
   | Iintop(Imulh) ->
       ([| rax; arg.(1) |], [| rdx |])
   | Ispecific(Ifloatarithmem(_,_)) ->
@@ -112,8 +113,30 @@ let pseudoregs_for_operation op arg res =
       ([| rax; rcx |], [| rax |])
   | Iintop(Imod) ->
       ([| rax; rcx |], [| rdx |])
+  | Ispecific Irdtsc ->
+  (* For rdtsc instruction, the result is in edx (high) and eax (low).
+     Make it simple and force the result in rdx and rax clobbered. *)
+    ([| |], [| rdx |])
+  | Ispecific Irdpmc ->
+  (* For rdpmc instruction, the argument must be in ecx
+     and the result is in edx (high) and eax (low).
+     Make it simple and force the argument in rcx, the result in rdx,
+     and rax clobbered *)
+    ([| rcx |], [| rdx |])
+  | Ispecific Icrc32q ->
+    (* arg.(0) and res.(0) must be the same *)
+    ([|res.(0); arg.(1)|], res)
   (* Other instructions are regular *)
-  | _ -> raise Use_default
+  | Iintop (Ipopcnt|Iclz _|Ictz _|Icomp _|Icheckbound _)
+  | Iintop_imm ((Imulh|Idiv|Imod|Icomp _|Icheckbound _
+                |Ipopcnt|Iclz _|Ictz _), _)
+  | Ispecific (Isqrtf|Isextend32|Izextend32|Ilea _|Istore_int (_, _, _)
+              |Ioffset_loc (_, _)|Ifloatsqrtf _)
+  | Imove|Ispill|Ireload|Ifloatofint|Iintoffloat|Iconst_int _|Iconst_float _
+  | Iconst_symbol _|Icall_ind _|Icall_imm _|Itailcall_ind _|Itailcall_imm _
+  | Iextcall _|Istackoffset _|Iload (_, _)|Istore (_, _, _)|Ialloc _
+  | Iname_for_debugger _|Iprobe _|Iprobe_is_enabled _
+    -> raise Use_default
 
 (* If you update [inline_ops], you may need to update [is_simple_expr] and/or
    [effects_of], below. *)
@@ -210,7 +233,37 @@ method! select_operation op args dbg =
          (Ispecific Isqrtf, [arg])
      | _ ->
          assert false
-     end
+    end
+  (* CR mshinwell for mshinwell: Re-read this case after first round of
+     review *)
+  | Cextcall { name; builtin = true; ret; label_after } ->
+    (* XCR mshinwell: The standard in this file is unfortunately two more
+       spaces of indentation for match cases; let's follow that for
+       consistency.
+
+       gyorsh: fixed.
+    *)
+    (* XCR mshinwell: Let's please remove the tuple brackets on these return
+       values; they aren't needed, and tend to clutter.  This is quite
+       complicated to look at as it is...
+
+       gyorsh: fixed in the new code.
+       Should I also fix it in the rest of the function, or leave it as is?
+    *)
+    (* XCR mshinwell: Can we check [ret] in all of these cases?  It's presumably
+       uniquely defined in each case.
+
+       gyorsh: ah, that's a great idea! added.
+    *)
+      begin match name, ret with
+      | "caml_rdtsc_unboxed", [|Int|] -> Ispecific Irdtsc, args
+      | "caml_rdpmc_unboxed", [|Int|] -> Ispecific Irdpmc, args
+      | ("caml_int64_crc_unboxed", [|Int|]
+        | "caml_int_crc_untagged", [|Int|]) when !Arch.crc32_support ->
+          Ispecific Icrc32q, args
+      | _ ->
+        super#select_operation op args dbg
+      end
   (* Recognize store instructions *)
   | Cstore ((Word_int|Word_val as chunk), _init) ->
       begin match args with