Skip to content

128-bit Array Load/Store #1682

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 77 commits into main from simd-array-ops
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from 57 commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
bf8af43
squash for review
TheNumbat Jul 28, 2023
a8994c2
restore consts test
TheNumbat Jul 28, 2023
a4b2c11
squash for review
TheNumbat Jul 28, 2023
72e32cc
masm fix
TheNumbat Jul 28, 2023
41dcda3
Merge branch 'simd-intrins' into simd-intrins2
TheNumbat Jul 28, 2023
06ff1cc
merge
TheNumbat Jul 28, 2023
a906a01
Merge branch 'simd-intrins' into simd-intrins2
TheNumbat Jul 28, 2023
c115d80
int64x2 tests
TheNumbat Jul 28, 2023
8129ee7
int32x4 tests
TheNumbat Jul 28, 2023
377662c
most int16x8 tests
TheNumbat Jul 28, 2023
b832bc4
finish int16x8 tests
TheNumbat Jul 31, 2023
248a6e5
int8x16 tests
TheNumbat Jul 31, 2023
0e2111a
utility tests
TheNumbat Jul 31, 2023
b90063b
string instrs + tests
TheNumbat Jul 31, 2023
cfbe737
add + test exotic ops
TheNumbat Jul 31, 2023
f65e53e
Update simd.ml
TheNumbat Jul 31, 2023
0259b48
address comments
TheNumbat Aug 1, 2023
3de0702
merge
TheNumbat Aug 1, 2023
7f8cf23
remove arg duping
TheNumbat Aug 1, 2023
8503711
add warnings
TheNumbat Aug 1, 2023
c9ddc5c
more warnings
TheNumbat Aug 1, 2023
b1b6c29
merge
TheNumbat Aug 1, 2023
fa668e9
allow f64 cast either stack op
TheNumbat Aug 1, 2023
99da902
add real unop case
TheNumbat Aug 1, 2023
2878f5f
Merge branch 'simd-intrins' into simd-intrins2
TheNumbat Aug 1, 2023
149bb72
Update backend/amd64/regalloc_stack_operands.ml
TheNumbat Aug 2, 2023
18a16e7
Merge branch 'simd-intrins' into simd-intrins2
TheNumbat Aug 2, 2023
f7a0bf6
Merge branch 'main' into simd-intrins
TheNumbat Aug 10, 2023
dcc61fb
Merge branch 'simd-intrins' into simd-intrins2
TheNumbat Aug 10, 2023
f01e32b
squash
TheNumbat Aug 11, 2023
00d5ca4
Merge branch 'main' into simd-intrins
TheNumbat Aug 14, 2023
d958cf0
Merge branch 'simd-intrins' into simd-intrins2
TheNumbat Aug 14, 2023
136e5e8
merge
TheNumbat Aug 14, 2023
2655421
error on string/bytes safe aligned case (not exposed)
TheNumbat Aug 14, 2023
be5a927
print
TheNumbat Aug 14, 2023
f9d8b00
fix dbg info order?
TheNumbat Aug 14, 2023
63930e7
remove unaligned access on string/bytes; assume ba is 16b aligned fro…
TheNumbat Aug 18, 2023
f509ee4
Merge branch 'main' into simd-intrins
TheNumbat Aug 22, 2023
c256097
Update middle_end/flambda2/from_lambda/lambda_to_flambda_primitives.ml
TheNumbat Aug 22, 2023
3f7cfe7
address comments
TheNumbat Aug 22, 2023
a0c197f
Merge branch 'simd-array-ops' of https://github.com/ocaml-flambda/fla…
TheNumbat Aug 22, 2023
5d4febc
Merge branch 'simd-intrins' into simd-intrins2
TheNumbat Aug 22, 2023
ecdc256
merge
TheNumbat Aug 22, 2023
6a237e2
add simd class_of_operation
TheNumbat Aug 23, 2023
659a44d
classes
TheNumbat Aug 23, 2023
1baf8ae
format
TheNumbat Aug 23, 2023
208e1f8
merge
TheNumbat Aug 23, 2023
2bdb8c0
merge
TheNumbat Aug 23, 2023
83a1bc7
Merge branch 'simd-intrins2' into simd-array-ops
TheNumbat Aug 23, 2023
751953e
random whitespace?
TheNumbat Aug 23, 2023
c4ac43c
improve tests
TheNumbat Aug 24, 2023
f61f493
closure check bound before align
TheNumbat Aug 24, 2023
4b011ab
Update backend/amd64/simd.ml
TheNumbat Aug 24, 2023
5924bcc
Merge branch 'simd-intrins' into simd-intrins2
TheNumbat Aug 24, 2023
74a4a5a
Merge branch 'simd-intrins2' into simd-array-ops
TheNumbat Aug 24, 2023
b951c38
merge
TheNumbat Aug 25, 2023
3c4ae41
Merge branch 'simd-intrins2' into simd-array-ops
TheNumbat Aug 25, 2023
298a8ac
address comments
TheNumbat Oct 9, 2023
1cb257e
address comments
TheNumbat Oct 9, 2023
ec8022d
address comments
TheNumbat Oct 9, 2023
9c464be
address comments
TheNumbat Oct 9, 2023
cb068c7
address comments
TheNumbat Oct 9, 2023
ed95a6c
Merge branch 'main' into simd-intrins2
TheNumbat Oct 9, 2023
29c8d36
merge
TheNumbat Oct 9, 2023
2c71f7b
use noexc for simd rounding codes
TheNumbat Oct 9, 2023
8ea31d5
Merge branch 'simd-intrins2' into simd-array-ops
TheNumbat Oct 9, 2023
18270cd
alignment comments
TheNumbat Oct 10, 2023
fa64161
Apply suggestions from code review
TheNumbat Oct 10, 2023
6e0fd67
simd_reload.ml
TheNumbat Oct 10, 2023
382aa46
Merge branch 'simd-intrins2' into simd-array-ops
TheNumbat Oct 10, 2023
55299d3
address comments
TheNumbat Oct 10, 2023
2e01e07
address comments
TheNumbat Oct 10, 2023
5fcef97
restore res/arg check
TheNumbat Oct 11, 2023
adba9cb
Merge branch 'simd-intrins2' of https://github.com/ocaml-flambda/flam…
TheNumbat Oct 11, 2023
72be21f
Merge branch 'simd-intrins2' into simd-array-ops
TheNumbat Oct 11, 2023
1654808
Merge branch 'main' into simd-array-ops
TheNumbat Oct 12, 2023
ecfac2c
label args
TheNumbat Oct 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backend/CSEgen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -240,9 +240,9 @@ method class_of_operation op =
| Iload(_,_,mut) -> Op_load mut
| Istore(_,_,asg) -> Op_store asg
| Ialloc _ | Ipoll _ -> assert false (* treated specially *)
| Iintop(Icheckbound) -> Op_checkbound
| Iintop(Icheckbound|Icheckalign _) -> Op_checkbound
| Iintop _ -> Op_pure
| Iintop_imm(Icheckbound, _) -> Op_checkbound
| Iintop_imm((Icheckbound|Icheckalign _), _) -> Op_checkbound
| Iintop_imm(_, _) -> Op_pure
| Iintop_atomic _ -> Op_store true
| Icompf _
Expand Down
317 changes: 278 additions & 39 deletions backend/amd64/emit.mlp

Large diffs are not rendered by default.

24 changes: 13 additions & 11 deletions backend/amd64/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -436,14 +436,14 @@ let destroyed_at_oper = function
| Ifloat_iround | Ifloat_min | Ifloat_max
| Ifloatarithmem (_, _) | Ibswap _ | Ifloatsqrtf _))
| Iop(Iintop(Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
| Ipopcnt | Iclz _ | Ictz _ | Icheckbound))
| Ipopcnt | Iclz _ | Ictz _ | Icheckbound | Icheckalign _))
| Iop(Iintop_imm((Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl
| Ilsr | Iasr | Ipopcnt | Iclz _ | Ictz _
| Icheckbound),_))
| Icheckbound | Icheckalign _),_))
| Iop(Iintop_atomic _)
| Iop(Istore((Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
| Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Word_val
| Double | Onetwentyeight ), _, _))
| Double | Onetwentyeight_aligned | Onetwentyeight_unaligned), _, _))
| Iop(Imove | Ispill | Ireload | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
| Icompf _
| Icsel _
Expand Down Expand Up @@ -476,21 +476,22 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
destroyed_at_pushtrap
| Op (Intop (Idiv | Imod)) | Op (Intop_imm ((Idiv | Imod), _)) ->
[| rax; rdx |]
| Op(Store(Single, _, _)) ->
| Op (Store(Single, _, _)) ->
destroy_xmm15 ()
| Op(Intop(Imulh _ | Icomp _) | Intop_imm((Icomp _), _)) ->
[| rax |]
| Op (Specific (Irdtsc | Irdpmc)) ->
[| rax; rdx |]
| Op (Intop Icheckbound | Intop_imm (Icheckbound, _)) ->
| Op (Intop (Icheckbound | Icheckalign _)
| Intop_imm ((Icheckbound | Icheckalign _), _)) ->
assert false
| Op (Move | Spill | Reload
| Const_int _ | Const_float _ | Const_symbol _ | Const_vec128 _
| Stackoffset _
| Load _ | Store ((Byte_unsigned | Byte_signed | Sixteen_unsigned
| Sixteen_signed | Thirtytwo_unsigned
| Thirtytwo_signed | Word_int | Word_val
| Double | Onetwentyeight ), _, _)
| Double | Onetwentyeight_aligned | Onetwentyeight_unaligned), _, _)
| Intop (Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr
| Iasr | Ipopcnt | Iclz _ | Ictz _)
| Intop_imm ((Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor
Expand Down Expand Up @@ -526,7 +527,7 @@ let destroyed_at_terminator (terminator : Cfg_intf.S.terminator) =
destroyed_at_alloc_or_poll
| Always _ | Parity_test _ | Truth_test _ | Float_test _ | Int_test _
| Return | Raise _ | Tailcall_self _ | Tailcall_func _
| Prim {op = Checkbound _ | Probe _; _}
| Prim {op = Checkbound _ | Checkalign _ | Probe _; _}
->
if fp then [| rbp |] else [||]
| Switch _ ->
Expand Down Expand Up @@ -559,7 +560,7 @@ let is_destruction_point (terminator : Cfg_intf.S.terminator) =
false
| Always _ | Parity_test _ | Truth_test _ | Float_test _ | Int_test _
| Return | Raise _ | Tailcall_self _ | Tailcall_func _
| Prim {op = Checkbound _ | Probe _; _} ->
| Prim {op = (Checkbound _ | Checkalign _) | Probe _; _} ->
false
| Switch _ ->
false
Expand Down Expand Up @@ -616,13 +617,13 @@ let max_register_pressure =
| Istore(Single, _, _) | Icompf _ ->
consumes ~int:0 ~float:1
| Iintop(Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
| Ipopcnt|Iclz _| Ictz _|Icheckbound)
| Ipopcnt|Iclz _| Ictz _|Icheckbound|Icheckalign _)
| Iintop_imm((Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr
| Iasr | Ipopcnt | Iclz _| Ictz _|Icheckbound), _)
| Iasr | Ipopcnt | Iclz _| Ictz _|Icheckbound|Icheckalign _), _)
| Iintop_atomic _
| Istore((Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
| Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Word_val
| Double | Onetwentyeight ),
| Double | Onetwentyeight_aligned | Onetwentyeight_unaligned),
_, _)
| Imove | Ispill | Ireload | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
| Icsel _
Expand Down Expand Up @@ -729,6 +730,7 @@ let operation_supported = function
| Ccmpf _
| Craise _
| Ccheckbound
| Ccheckalign _
| Cvectorcast _ | Cscalarcast _
| Cprobe _ | Cprobe_is_enabled _ | Copaque | Cbeginregion | Cendregion
-> true
Expand Down
24 changes: 17 additions & 7 deletions backend/amd64/regalloc_stack_operands.ml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ open! Regalloc_utils
let debug = false

let may_use_stack_operand_for_second_argument
: type a . spilled_map -> a Cfg.instruction -> stack_operands_rewrite
= fun map instr ->
: type a . ?num_args:int -> spilled_map -> a Cfg.instruction -> stack_operands_rewrite
= fun ?(num_args = 2) map instr ->
if debug then begin
check_lengths instr ~of_arg:2 ~of_res:1;
check_lengths instr ~of_arg:num_args ~of_res:1;
check_same "res(0)" instr.res.(0) "arg(0)" instr.arg.(0);
end;
begin match is_spilled map instr.arg.(1) with
Expand Down Expand Up @@ -181,8 +181,13 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
may_use_stack_operand_for_second_argument map instr
| Op (Specific (Isimd op)) ->
(match Simd_selection.register_behavior op with
| R_to_R | R_R_to_fst -> May_still_have_spilled_registers
| R_RM_to_fst -> may_use_stack_operand_for_second_argument map instr
| R_to_fst | R_to_R | R_R_to_fst -> May_still_have_spilled_registers
| R_RM_to_fst | String_no_length | String_no_length_mask ->
may_use_stack_operand_for_second_argument map instr
| String_length | String_length_mask ->
may_use_stack_operand_for_second_argument ~num_args:4 map instr
| R_RM_XMM0_to_fst -> may_use_stack_operand_for_second_argument ~num_args:3 map instr
| R_to_RM -> may_use_stack_operand_for_result map instr ~num_args:1
| RM_to_R -> may_use_stack_operand_for_only_argument map instr ~has_result:true)
| Op (Scalarcast (V128_to_scalar (Float64x2) | V128_of_scalar (Float64x2))) ->
unary_operation_argument_or_result_on_stack map instr
Expand Down Expand Up @@ -246,8 +251,8 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
| Prologue ->
(* no rewrite *)
May_still_have_spilled_registers
| Op (Intop Icheckbound)
| Op (Intop_imm ((Ipopcnt | Iclz _ | Ictz _ | Icheckbound), _)) ->
| Op (Intop (Icheckbound | Icheckalign _))
| Op (Intop_imm ((Ipopcnt | Iclz _ | Ictz _ | Icheckbound | Icheckalign _), _)) ->
(* should not happen *)
fatal "unexpected instruction"
end
Expand All @@ -259,11 +264,16 @@ let terminator (map : spilled_map) (term : Cfg.terminator Cfg.instruction) =
| Int_test { lt = _; eq = _; gt =_; is_signed = _; imm = None; }
| Prim {op = Checkbound { immediate = None; }; _} ->
binary_operation map term No_result
| Prim {op = Checkalign { immediate = None; _ }; _} ->
may_use_stack_operand_for_only_argument ~has_result:false map term
| Int_test { lt = _; eq = _; gt =_; is_signed = _; imm = Some _; }
| Parity_test { ifso = _; ifnot = _; }
| Truth_test { ifso = _; ifnot = _; }
| Prim {op = Checkbound { immediate = Some _; }; _} ->
may_use_stack_operand_for_only_argument ~has_result:false map term
| Prim {op = Checkalign { immediate = Some _; _ }; _} ->
if debug then check_lengths term ~of_arg:0 ~of_res:0;
All_spilled_registers_rewritten
| Float_test _ ->
(* CR-someday xclerc for xclerc: this could be optimized, but the representation
makes it more difficult than the cases above, because (i) multiple
Expand Down
25 changes: 21 additions & 4 deletions backend/amd64/reload.ml
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ method! reload_operation op arg res =
if stackp res.(0) then [| self#makereg res.(0) |] else res
in
arg, res
| Iintop(Imulh _ | Idiv | Imod | Ilsl | Ilsr | Iasr)
| Iintop(Imulh _ | Idiv | Imod | Ilsl | Ilsr | Iasr | Icheckalign _)
| Iintop_imm((Iadd | Isub | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
| Imulh _ | Idiv | Imod | Icheckbound), _) ->
| Imulh _ | Idiv | Imod | Icheckbound | Icheckalign _), _) ->
(* The argument(s) and results can be either in register or on stack *)
(* Note: Imulh, Idiv, Imod: arg(0) and res(0) already forced in regs
Ilsl, Ilsr, Iasr: arg(1) already forced in regs *)
Expand All @@ -134,6 +134,14 @@ method! reload_operation op arg res =
else (arg, res)
| Ispecific(Isimd op) ->
(match Simd_selection.register_behavior op with
| R_to_fst ->
(* Argument must be in a register; result must be the argument. *)
let arg0 = if stackp arg.(0) then self#makereg arg.(0) else arg.(0) in
([|arg0|], [|arg0|])
| R_to_RM ->
(* Argument must be in a register. *)
let arg0 = if stackp arg.(0) then self#makereg arg.(0) else arg.(0) in
([|arg0|], res)
| RM_to_R ->
(* Result must be in a register. *)
let res0 = if stackp res.(0) then self#makereg res.(0) else res.(0) in
Expand All @@ -148,11 +156,20 @@ method! reload_operation op arg res =
let arg0 = if stackp arg.(0) then self#makereg arg.(0) else arg.(0) in
let arg1 = if stackp arg.(1) then self#makereg arg.(1) else arg.(1) in
([|arg0; arg1|], [|arg0|])
| R_RM_to_fst ->
| R_RM_to_fst | R_RM_XMM0_to_fst ->
(* First argument must be a register; the result must be the first arg.
Note that stack-spilled vectors are properly aligned. *)
let arg0 = if stackp arg.(0) then self#makereg arg.(0) else arg.(0) in
([|arg0; arg.(1)|], [|arg0|]))
let arg = Array.copy arg in
Array.set arg 0 arg0;
(arg, [|arg0|])
| String_length | String_no_length | String_length_mask | String_no_length_mask ->
(* First argument must be a register. Specific register constraints
are enforced by selection. *)
let arg0 = if stackp arg.(0) then self#makereg arg.(0) else arg.(0) in
let arg = Array.copy arg in
Array.set arg 0 arg0;
(arg, res))
| Ifloatofint | Iintoffloat ->
(* Result must be in register, but argument can be on stack *)
(arg, (if stackp res.(0) then [| self#makereg res.(0) |] else res))
Expand Down
22 changes: 18 additions & 4 deletions backend/amd64/selection.ml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ exception Use_default
let rax = phys_reg Int 0
let rcx = phys_reg Int 5
let rdx = phys_reg Int 4
let xmm0v () = phys_reg Vec128 100

let pseudoregs_for_operation op arg res =
match op with
Expand Down Expand Up @@ -156,19 +157,32 @@ let pseudoregs_for_operation op arg res =
([|res.(0); arg.(1)|], res)
| Ispecific (Isimd op) ->
(match Simd_selection.register_behavior op with
| RM_to_R | R_to_R -> (arg, res)
| R_to_R | RM_to_R | R_to_RM -> (arg, res)
| R_to_fst ->
(* arg.(0) and res.(0) must be the same *)
([|res.(0)|], res)
| R_R_to_fst | R_RM_to_fst ->
(* arg.(0) and res.(0) must be the same *)
([|res.(0); arg.(1)|], res))
([|res.(0); arg.(1)|], res)
| R_RM_XMM0_to_fst ->
([|res.(0); arg.(1); xmm0v ()|], res)
| String_length ->
([|arg.(0); arg.(1); rax; rdx|], [| rcx |])
| String_length_mask ->
([|arg.(0); arg.(1); rax; rdx|], [| xmm0v () |])
| String_no_length ->
(arg, [| rcx |])
| String_no_length_mask ->
(arg, [| xmm0v () |]))
| Icsel _ ->
(* last arg must be the same as res.(0) *)
let len = Array.length arg in
let arg = Array.copy arg in
arg.(len-1) <- res.(0);
(arg, res)
(* Other instructions are regular *)
| Iintop (Ipopcnt|Iclz _|Ictz _|Icomp _|Icheckbound)
| Iintop_imm ((Imulh _|Idiv|Imod|Icomp _|Icheckbound
| Iintop (Ipopcnt|Iclz _|Ictz _|Icomp _|Icheckbound|Icheckalign _)
| Iintop_imm ((Imulh _|Idiv|Imod|Icomp _|Icheckbound|Icheckalign _
|Ipopcnt|Iclz _|Ictz _), _)
| Ispecific (Isqrtf|Isextend32|Izextend32|Ilea _|Istore_int (_, _, _)
|Ifloat_iround|Ifloat_round _
Expand Down
Loading