Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add intrinsics for rdtsc, rdpmc, crc32 (amd64) #20

Merged
merged 4 commits into from
May 13, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add intrinsics for rdtsc, rdpmc, crc32 (amd64)
  • Loading branch information
gretay-js authored and mshinwell committed May 13, 2021
commit 49fc923acf6b0f8b7c282567d70542091c3b0e13
2 changes: 2 additions & 0 deletions backend/amd64/CSE.ml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ method! class_of_operation op =
| Ioffset_loc(_, _) -> Op_store true
| Ifloatarithmem _ | Ifloatsqrtf _ -> Op_load
| Ibswap _ | Isqrtf -> super#class_of_operation op
| Irdtsc | Irdpmc -> Op_other
| Icrc32q -> Op_pure
end
| _ -> super#class_of_operation op

Expand Down
16 changes: 16 additions & 0 deletions backend/amd64/arch.ml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
(* POPCNT instruction is not available prior to Nehalem, released in 2008. *)
let popcnt_support = ref true

(* CRC32 requires SSE 4.2 support *)
let crc32_support = ref true

(* Machine-specific command-line options *)

let command_line_options =
Expand All @@ -27,6 +30,10 @@ let command_line_options =
" Use POPCNT instruction (not available prior to Nehalem)";
"-fno-popcnt", Arg.Clear popcnt_support,
" Do not use POPCNT instruction";
"-fcrc32", Arg.Set crc32_support,
" Use CRC32 instructions (requires SSE4.2 support)";
"-fno-crc32", Arg.Clear crc32_support,
" Do not emit CRC32 instructions";
]

(* Specific operations for the AMD64 processor *)
Expand Down Expand Up @@ -54,6 +61,9 @@ type specific_operation =
extension *)
| Izextend32 (* 32 to 64 bit conversion with zero
extension *)
| Irdtsc (* read timestamp *)
| Irdpmc (* read performance counter *)
| Icrc32q (* compute crc *)

and float_operation =
Ifloatadd | Ifloatsub | Ifloatmul | Ifloatdiv
Expand Down Expand Up @@ -143,6 +153,12 @@ let print_specific_operation printreg op ppf arg =
fprintf ppf "sextend32 %a" printreg arg.(0)
| Izextend32 ->
fprintf ppf "zextend32 %a" printreg arg.(0)
| Irdtsc ->
fprintf ppf "rdtsc"
| Irdpmc ->
fprintf ppf "rdpmc %a" printreg arg.(0)
| Icrc32q ->
fprintf ppf "crc32 %a %a" printreg arg.(0) printreg arg.(1)

let win64 =
match Config.system with
Expand Down
35 changes: 35 additions & 0 deletions backend/amd64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,41 @@ let emit_instr fallthrough i =
| Lop(Iintop Ipopcnt) ->
assert (!popcnt_support);
I.popcnt (arg i 0) (res i 0)
| Lop(Ispecific Irdtsc) ->
(* XCR mshinwell: Instead of the above comment, let's write assertions to
ensure that the constraints have been satisfied. Likewise for any other
intrinsic cases below with hard register constraints. *)
assert (reg64 i.res.(0) = RDX);
I.rdtsc ();
(* The instruction fills in the low 32 bits of the result registers. *)
(* Combine edx and eax into a single 64-bit result in rdx. *)
I.sal (int 32) (res i 0); (* shift edx to the high part of rdx *)
(* XCR mshinwell: Why is this sign extension needed? (same below).

gyorsh: it looks wrong, thanks for catching it!
I think it was supposed to be zero extend.
For rdtsc it is not needed for this target
(see comment added below), even though gcc still generates it -
as a simple mov (not movzxd) on 32 bit registers, which zeros
the high bits of the corresponding 64-bit registers.
For rdpmc, it is still needed, I think, so I fixed it below
to movzx. Maybe "mov eax eax" would be better: faster and smaller code.
*)
(* On processors that support the Intel 64 architecture,
the high-order 32 bits of each of RAX and RDX are cleared. *)
I.or_ rax (res i 0) (* combine high and low into rdx *)
| Lop(Ispecific Irdpmc) ->
assert ((arg64 i 0 = RCX) && (reg64 i.res.(0) = RDX));
I.rdpmc ();
(* The instruction fills in the low 32 bits of the result registers. *)
(* Combine edx and eax into a single 64-bit result in rdx. *)
I.sal (int 32) (res i 0); (* shift edx to the high part of rdx *)
I.mov eax eax; (* zero-extend eax *)
I.or_ rax (res i 0) (* combine high and low into rdx *)
| Lop (Ispecific Icrc32q) ->
assert (arg i 0 = res i 0);
(* XCR mshinwell: write an assertion, not a comment! *)
I.crc32 (arg i 1) (res i 0)
| Lop (Iname_for_debugger _) -> ()
| Lop (Iprobe _) ->
let probe_label = new_label () in
Expand Down
1 change: 1 addition & 0 deletions backend/amd64/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ let destroyed_at_oper = function
[| loc_spacetime_node_hole |]
| Iswitch(_, _) -> [| rax; rdx |]
| Itrywith _ -> [| r11 |]
| Iop(Ispecific (Irdtsc | Irdpmc)) -> [| rax |]
| _ ->
if fp then
(* prevent any use of the frame pointer ! *)
Expand Down
56 changes: 55 additions & 1 deletion backend/amd64/reload.ml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ open Mach
Iintoffloat R S
Ispecific(Ilea) R R R
Ispecific(Ifloatarithmem) R R R
Ispecific(Icrc32q) R R S (and Res = Arg1)
Ispecific(Irdtsc) R (and Res = rdx)
Ispecific(Irdpmc) R R (and Res = rdx, Arg1 = rcx)

Conditional branches:
Iinttest S R
Expand Down Expand Up @@ -86,6 +89,42 @@ method! reload_operation op arg res =
if stackp arg.(0)
then (let r = self#makereg arg.(0) in ([|r; arg.(1)|], [|r|]))
else (arg, res)
(* CR mshinwell for mshinwell: re-read the next three cases once
comments addressed *)
| Ispecific (Irdtsc | Irdpmc) ->
(* XCR mshinwell: The table in the comment needs updating for this
operation and the next one below *)
(* XCR mshinwell: I don't think this [force] function is needed. I'm not
clear on the exact details, but these cases are very similar to
Iintop (Idiv | Imod | Imulh) -- see selection.ml and the comment above
which says "already forced in regs". *)
(* XCR mshinwell: Same as previous case. *)
(* Irdtsc: res(0) already forced in reg.
Irdpmc: res(0) and arg(0) already forced in regs. *)
(arg, res)
| Ispecific Icrc32q ->
(* First argument and result must be in the same register.
Second argument can be either in a register or on stack. *)
(* XCR mshinwell: I think something is missing in selection.ml here.
Look for example at the Iintop Imulf case. That has a case in
selection.ml to ensure that the register constraint is satisfied.
I suspect that if we add a similar case in [pseudoregs_for_operation]
for Icrc32q, then [first_arg_and_res_overlap] will always be [true]
here. The following code should then be much more straightforward.
I think the way to think about the distinction between selection.ml
and this file is that the former should establish any necessary
hard register or equality constraints; and this file only needs to deal
with fixing things up if an operand or result that has to be in
a register has thus far been assigned to the stack.
(In the case of selection.ml establishing a hard register constraint,
the allocator presumably never changes that, which is why e.g. in the
two cases above for the timestamp counters we shouldn't need any kind
of forcing function here.)

gretay: fixed, thank you for explaining it!*)
if stackp arg.(0)
then (let r = self#makereg arg.(0) in ([|r; arg.(1)|], [|r|]))
else (arg, res)
| Ifloatofint | Iintoffloat ->
(* Result must be in register, but argument can be on stack *)
(arg, (if stackp res.(0) then [| self#makereg res.(0) |] else res))
Expand All @@ -97,7 +136,22 @@ method! reload_operation op arg res =
if !Clflags.pic_code || !Clflags.dlcode || Arch.win64
then super#reload_operation op arg res
else (arg, res)
| _ -> (* Other operations: all args and results in registers *)
(* XCR mshinwell: Since we're here, let's please turn this into an
exhaustive match. (Maybe you could make a separate patch to do that.)
This will increase confidence that we haven't missed any cases as we
add intrinsics. Please also do the same for pseudoregs_for_operation in
selection.ml.

gyorsh: done. do you mean as a separate patch upstream? *)
| Iintop (Ipopcnt | Iclz _| Ictz _)
| Ispecific (Isqrtf | Isextend32 | Izextend32 | Ilea _
| Istore_int (_, _, _)
| Ioffset_loc (_, _) | Ifloatarithmem (_, _)
| Ibswap _| Ifloatsqrtf _)
| Imove|Ispill|Ireload|Inegf|Iabsf|Iconst_float _|Icall_ind _|Icall_imm _
| Itailcall_ind _|Itailcall_imm _|Iextcall _|Istackoffset _|Iload (_, _)
| Istore (_, _, _)|Ialloc _|Iname_for_debugger _|Iprobe _|Iprobe_is_enabled _
-> (* Other operations: all args and results in registers *)
super#reload_operation op arg res

method! reload_test tst arg =
Expand Down
57 changes: 55 additions & 2 deletions backend/amd64/selection.ml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ let pseudoregs_for_operation op arg res =
([| rax |], [| rax |])
(* For imulq, first arg must be in rax, rax is clobbered, and result is in
rdx. *)
| Ispecific (Ibswap _) -> assert false
| Iintop(Imulh) ->
([| rax; arg.(1) |], [| rdx |])
| Ispecific(Ifloatarithmem(_,_)) ->
Expand All @@ -112,8 +113,30 @@ let pseudoregs_for_operation op arg res =
([| rax; rcx |], [| rax |])
| Iintop(Imod) ->
([| rax; rcx |], [| rdx |])
| Ispecific Irdtsc ->
(* For rdtsc instruction, the result is in edx (high) and eax (low).
Make it simple and force the result in rdx and rax clobbered. *)
([| |], [| rdx |])
| Ispecific Irdpmc ->
(* For rdpmc instruction, the argument must be in ecx
and the result is in edx (high) and eax (low).
Make it simple and force the argument in rcx, the result in rdx,
and rax clobbered *)
([| rcx |], [| rdx |])
| Ispecific Icrc32q ->
(* arg.(0) and res.(0) must be the same *)
([|res.(0); arg.(1)|], res)
(* Other instructions are regular *)
| _ -> raise Use_default
| Iintop (Ipopcnt|Iclz _|Ictz _|Icomp _|Icheckbound _)
| Iintop_imm ((Imulh|Idiv|Imod|Icomp _|Icheckbound _
|Ipopcnt|Iclz _|Ictz _), _)
| Ispecific (Isqrtf|Isextend32|Izextend32|Ilea _|Istore_int (_, _, _)
|Ioffset_loc (_, _)|Ifloatsqrtf _)
| Imove|Ispill|Ireload|Ifloatofint|Iintoffloat|Iconst_int _|Iconst_float _
| Iconst_symbol _|Icall_ind _|Icall_imm _|Itailcall_ind _|Itailcall_imm _
| Iextcall _|Istackoffset _|Iload (_, _)|Istore (_, _, _)|Ialloc _
| Iname_for_debugger _|Iprobe _|Iprobe_is_enabled _
-> raise Use_default

(* If you update [inline_ops], you may need to update [is_simple_expr] and/or
[effects_of], below. *)
Expand Down Expand Up @@ -210,7 +233,37 @@ method! select_operation op args dbg =
(Ispecific Isqrtf, [arg])
| _ ->
assert false
end
end
(* CR mshinwell for mshinwell: Re-read this case after first round of
review *)
| Cextcall { name; builtin = true; ret; label_after } ->
(* XCR mshinwell: The standard in this file is unfortunately two more
spaces of indentation for match cases; let's follow that for
consistency.

gyorsh: fixed.
*)
(* XCR mshinwell: Let's please remove the tuple brackets on these return
values; they aren't needed, and tend to clutter. This is quite
complicated to look at as it is...

gyorsh: fixed in the new code.
Should I also fix it in the rest of the function, or leave it as is?
*)
(* XCR mshinwell: Can we check [ret] in all of these cases? It's presumably
uniquely defined in each case.

gyorsh: ah, that's a great idea! added.
*)
begin match name, ret with
| "caml_rdtsc_unboxed", [|Int|] -> Ispecific Irdtsc, args
| "caml_rdpmc_unboxed", [|Int|] -> Ispecific Irdpmc, args
| ("caml_int64_crc_unboxed", [|Int|]
| "caml_int_crc_untagged", [|Int|]) when !Arch.crc32_support ->
Ispecific Icrc32q, args
| _ ->
super#select_operation op args dbg
end
(* Recognize store instructions *)
| Cstore ((Word_int|Word_val as chunk), _init) ->
begin match args with
Expand Down