Skip to content

Commit

Permalink
Enable locals for arm64 (ocaml-flambda#2442)
Browse files Browse the repository at this point in the history
* Implement local allocation for arm64

This patch implements local allocation by adding code emission for local
allocation, regions and assembly routine for calling stack relocation.

(cherry picked from commit 0694909)

* Minor fixes to backend/arm64/emit.mlp

* Copy runtime5 implementation of caml_call_local_realloc to runtime4

* Revert ocaml/asmcomp changes

* Code size estimate for local allocs

* Line lengths

* Tweak boundary condition for allocation to match amd64 emitter

* New runtime5 implementation for caml_call_local_realloc

---------

Co-authored-by: Anmol Sahoo <anmol.sahoo25@gmail.com>
  • Loading branch information
mshinwell and anmolsahoo25 authored Apr 19, 2024
1 parent 9fde93b commit 7c4a510
Show file tree
Hide file tree
Showing 3 changed files with 200 additions and 44 deletions.
136 changes: 92 additions & 44 deletions backend/arm64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,22 @@ let emit_call_gc gc =
`{emit_label gc.gc_lbl}: bl {emit_symbol "caml_call_gc"}\n`;
`{emit_label gc.gc_frame_lbl}: b {emit_label gc.gc_return_lbl}\n`

(* Record calls to local stack reallocation *)

type local_realloc_call =
{ lr_lbl: label;
lr_return_lbl: label;
lr_dbg: Debuginfo.t
}

let local_realloc_sites = ref ([] : local_realloc_call list)

let emit_local_realloc lr =
`{emit_label lr.lr_lbl}:\n`;
` {emit_debug_info lr.lr_dbg}\n`;
` bl {emit_symbol "caml_call_local_realloc"}\n`;
` b {emit_label lr.lr_return_lbl}\n`

(* Names of various instructions *)

let name_for_comparison = function
Expand Down Expand Up @@ -374,7 +390,7 @@ let num_call_gc_points instr =
let rec loop instr call_gc =
match instr.desc with
| Lend -> call_gc
| Lop (Ialloc _) when !fastcode_flag ->
| Lop (Ialloc { mode = Alloc_heap; _ }) when !fastcode_flag ->
loop instr.next (call_gc + 1)
| Lop (Ipoll _) ->
loop instr.next (call_gc + 1)
Expand Down Expand Up @@ -480,6 +496,7 @@ module BR = Branch_relaxation.Make (struct
| _ -> 0
and single = match memory_chunk with Single -> 2 | _ -> 1 in
based + barrier + single
| Lop (Ialloc { mode = Alloc_local; _ }) -> 9
| Lop (Ialloc _) when !fastcode_flag -> 5
| Lop (Ispecific (Ifar_alloc _)) when !fastcode_flag -> 6
| Lop (Ipoll _) -> 3
Expand All @@ -491,8 +508,7 @@ module BR = Branch_relaxation.Make (struct
| _ -> 1 + num_instructions_for_intconst (Nativeint.of_int num_bytes)
end
| Lop (Icsel _) -> 4
| Lop (Ibeginregion | Iendregion) ->
Misc.fatal_error "Local allocations not supported on this architecture"
| Lop (Ibeginregion | Iendregion) -> 1
| Lop (Iintop (Icomp _)) -> 2
| Lop (Ifloatop (Icompf _)) -> 2
| Lop (Iintop_imm (Icomp _, _)) -> 2
Expand Down Expand Up @@ -569,44 +585,68 @@ let name_for_float_comparison = function

(* Output the assembly code for allocation. *)

let assembly_code_for_allocation i ~n ~far ~dbginfo =
let lbl_frame =
record_frame_label i.live (Dbg_alloc dbginfo)
in
if !fastcode_flag then begin
let lbl_after_alloc = new_label() in
let lbl_call_gc = new_label() in
(* n is at most Max_young_whsize * 8, i.e. currently 0x808,
so it is reasonable to assume n < 0x1_000. This makes
the generated code simpler. *)
assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
if not far then begin
` b.lo {emit_label lbl_call_gc}\n`
end else begin
let lbl = new_label () in
` b.cs {emit_label lbl}\n`;
` b {emit_label lbl_call_gc}\n`;
`{emit_label lbl}:\n`
end;
`{emit_label lbl_after_alloc}:`;
` add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
call_gc_sites :=
{ gc_lbl = lbl_call_gc;
gc_return_lbl = lbl_after_alloc;
gc_frame_lbl = lbl_frame } :: !call_gc_sites
let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
if local then begin
let r = i.res.(0) in
let module DS = Domainstate in
let domain_local_sp_offset = DS.(idx_of_field Domain_local_sp) * 8 in
let domain_local_limit_offset = DS.(idx_of_field Domain_local_limit) * 8 in
let domain_local_top_offset = DS.(idx_of_field Domain_local_top) * 8 in
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_limit_offset}]\n`;
` ldr {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
` sub {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
` str {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
` cmp {emit_reg r}, {emit_reg reg_tmp1}\n`;
let lbl_call = new_label () in
` b.lt {emit_label lbl_call}\n`;
let lbl_after_alloc = new_label () in
`{emit_label lbl_after_alloc}:\n`;
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_top_offset}]\n`;
` add {emit_reg r}, {emit_reg r}, {emit_reg reg_tmp1}\n`;
` add {emit_reg r}, {emit_reg r}, #{emit_int 8}\n`;
local_realloc_sites :=
{ lr_lbl = lbl_call;
lr_dbg = i.dbg;
lr_return_lbl = lbl_after_alloc } :: !local_realloc_sites
end else begin
begin match n with
| 16 -> ` bl {emit_symbol "caml_alloc1"}\n`
| 24 -> ` bl {emit_symbol "caml_alloc2"}\n`
| 32 -> ` bl {emit_symbol "caml_alloc3"}\n`
| _ -> emit_intconst reg_x8 (Nativeint.of_int n);
` bl {emit_symbol "caml_allocN"}\n`
end;
`{emit_label lbl_frame}: add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`
let lbl_frame =
record_frame_label i.live (Dbg_alloc dbginfo)
in
if !fastcode_flag then begin
let lbl_after_alloc = new_label() in
let lbl_call_gc = new_label() in
(* n is at most Max_young_whsize * 8, i.e. currently 0x808,
so it is reasonable to assume n < 0x1_000. This makes
the generated code simpler. *)
assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
if not far then begin
` b.lo {emit_label lbl_call_gc}\n`
end else begin
let lbl = new_label () in
` b.cs {emit_label lbl}\n`;
` b {emit_label lbl_call_gc}\n`;
`{emit_label lbl}:\n`
end;
`{emit_label lbl_after_alloc}:`;
` add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
call_gc_sites :=
{ gc_lbl = lbl_call_gc;
gc_return_lbl = lbl_after_alloc;
gc_frame_lbl = lbl_frame } :: !call_gc_sites
end else begin
begin match n with
| 16 -> ` bl {emit_symbol "caml_alloc1"}\n`
| 24 -> ` bl {emit_symbol "caml_alloc2"}\n`
| 32 -> ` bl {emit_symbol "caml_alloc3"}\n`
| _ -> emit_intconst reg_x8 (Nativeint.of_int n);
` bl {emit_symbol "caml_allocN"}\n`
end;
`{emit_label lbl_frame}: add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`
end
end

let assembly_code_for_poll i ~far ~return_label =
Expand Down Expand Up @@ -851,11 +891,17 @@ let emit_instr i =
fatal_error "arm64: got 128 bit memory chunk"
end
| Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_heap }) ->
assembly_code_for_allocation i ~n ~far:false ~dbginfo
assembly_code_for_allocation i ~n ~local:false ~far:false ~dbginfo
| Lop(Ispecific (Ifar_alloc { bytes = n; dbginfo })) ->
assembly_code_for_allocation i ~n ~far:true ~dbginfo
| Lop(Ialloc { mode = Alloc_local } | Ibeginregion | Iendregion) ->
Misc.fatal_error "Local allocations not supported on this architecture"
assembly_code_for_allocation i ~n ~local:false ~far:true ~dbginfo
| Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_local }) ->
assembly_code_for_allocation i ~n ~local:true ~far:false ~dbginfo
| Lop(Ibeginregion) ->
let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
` ldr {emit_reg i.res.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
| Lop(Iendregion) ->
let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
` str {emit_reg i.arg.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
| Lop(Ipoll { return_label }) ->
assembly_code_for_poll i ~far:false ~return_label
| Lop(Ispecific (Ifar_poll { return_label })) ->
Expand Down Expand Up @@ -1112,6 +1158,7 @@ let fundecl fundecl =
float_literals := [];
stack_offset := 0;
call_gc_sites := [];
local_realloc_sites := [];
for i = 0 to Proc.num_stack_slot_classes - 1 do
num_stack_slots.(i) <- fundecl.fun_num_stack_slots.(i);
done;
Expand Down Expand Up @@ -1171,6 +1218,7 @@ let fundecl fundecl =

emit_all fundecl.fun_body;
List.iter emit_call_gc !call_gc_sites;
List.iter emit_local_realloc !local_realloc_sites;
assert (List.length !call_gc_sites = num_call_gc);
(match fun_end_label with
| None -> ()
Expand Down
23 changes: 23 additions & 0 deletions ocaml/runtime/arm64.S
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,29 @@ FUNCTION(caml_allocN)
CFI_ENDPROC
END_FUNCTION(caml_allocN)

/* Reallocate the locals stack. This is like caml_call_gc, above. */
FUNCTION(caml_call_local_realloc)
CFI_STARTPROC
L(caml_call_local_realloc):
/* Save return address and frame pointer */
CFI_OFFSET(29, -16)
CFI_OFFSET(30, -8)
stp x29, x30, [sp, -16]!
CFI_ADJUST(16)
add x29, sp, #0
/* Store all registers (including ALLOC_PTR & TRAP_PTR) */
SAVE_ALL_REGS
SWITCH_OCAML_TO_C
/* Call the runtime to reallocate the local stack */
bl G(caml_local_realloc)
SWITCH_C_TO_OCAML
RESTORE_ALL_REGS
/* Free stack space and return to caller */
ldp x29, x30, [sp], 16
ret
CFI_ENDPROC
END_FUNCTION(caml_call_gc)

/* Call a C function from OCaml */
/* Function to call is in ADDITIONAL_ARG */

Expand Down
85 changes: 85 additions & 0 deletions ocaml/runtime4/arm64.S
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,91 @@ FUNCTION(caml_allocN)
CFI_ENDPROC
END_FUNCTION(caml_allocN)

FUNCTION(caml_call_local_realloc)
L(caml_call_local_realloc):
CFI_STARTPROC
/* Set up stack space, saving return address and frame pointer */
/* Store return address and frame pointer */
/* (2 RA/GP, 24 allocatable int regs, 24 caller-saved float regs) * 8 */
CFI_OFFSET(29,-400)
CFI_OFFSET(30,-392)
stp x29, x30, [sp,-400]! /* pre-indexing stp */
CFI_ADJUST(400)
add x29, sp, #0

/* Save allocatable integer registers on the stack, using order in proc.ml */
stp x0, x1, [sp, 16]
stp x2, x3, [sp, 32]
stp x4, x5, [sp, 48]
stp x6, x7, [sp, 64]
stp x8, x9, [sp, 80]
stp x10, x11, [sp, 96]
stp x12, x13, [sp, 112]
stp x14, x15, [sp, 128]
stp x19, x20, [sp, 144]
stp x21, x22, [sp, 160]
stp x23, x24, [sp, 176]
str x25, [sp, 192]

/* Save caller saved floating-point registers on the stack */
stp d0, d1, [sp, 208]
stp d2, d3, [sp, 224]
stp d4, d5, [sp, 240]
stp d6, d7, [sp, 256]
stp d16, d17, [sp, 272]
stp d18, d19, [sp, 288]
stp d20, d21, [sp, 304]
stp d22, d23, [sp, 320]
stp d24, d25, [sp, 336]
stp d26, d27, [sp, 352]
stp d28, d29, [sp, 368]
stp d30, d31, [sp, 384]

/* Store pointer to saved integer registers in Caml_state->gc_regs */
add TMP, sp, #16
str TMP, Caml_state(gc_regs)

/* Save current allocation pointer for debugging purposes */
str ALLOC_PTR, Caml_state(young_ptr)

/* Call the realloc function */
bl G(caml_local_realloc)

/* Restore registers */
ldp x0, x1, [sp, 16]
ldp x2, x3, [sp, 32]
ldp x4, x5, [sp, 48]
ldp x6, x7, [sp, 64]
ldp x8, x9, [sp, 80]
ldp x10, x11, [sp, 96]
ldp x12, x13, [sp, 112]
ldp x14, x15, [sp, 128]
ldp x19, x20, [sp, 144]
ldp x21, x22, [sp, 160]
ldp x23, x24, [sp, 176]
ldr x25, [sp, 192]
ldp d0, d1, [sp, 208]
ldp d2, d3, [sp, 224]
ldp d4, d5, [sp, 240]
ldp d6, d7, [sp, 256]
ldp d16, d17, [sp, 272]
ldp d18, d19, [sp, 288]
ldp d20, d21, [sp, 304]
ldp d22, d23, [sp, 320]
ldp d24, d25, [sp, 336]
ldp d26, d27, [sp, 352]
ldp d28, d29, [sp, 368]
ldp d30, d31, [sp, 384]

/* Reload new allocation pointer */
ldr ALLOC_PTR, Caml_state(young_ptr)

/* Free stack space and return to caller */
ldp x29, x30, [sp], 400
ret
CFI_ENDPROC
END_FUNCTION(caml_call_local_realloc)

/* Call a C function from OCaml */
/* Function to call is in ADDITIONAL_ARG */

Expand Down

0 comments on commit 7c4a510

Please sign in to comment.