Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SIMD register destruction #2311

Merged
merged 3 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/amd64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -963,7 +963,7 @@ let emit_atomic instr op (size : Cmm.atomic_bitwidth) addr =
I.movzx res8 res

let emit_simd_instr op i =
(match Simd_selection.register_behavior op with
(match Simd_proc.register_behavior op with
| R_to_fst ->
assert (arg i 0 = res i 0);
assert (Reg.is_reg i.arg.(0))
Expand Down
48 changes: 40 additions & 8 deletions backend/amd64/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -170,15 +170,16 @@ let phys_reg ty n =

let rax = phys_reg Int 0
let rdx = phys_reg Int 4
let rcx = phys_reg Int 5
let r10 = phys_reg Int 10
let r11 = phys_reg Int 11
let rbp = phys_reg Int 12

(* CSE needs to know that all versions (Float and, when SIMD is enabled, Vec128) of the destroyed xmm register (e.g. xmm15) are destroyed. *)
let destroy_xmm15 () =
let destroy_xmm n =
if Language_extension.is_enabled SIMD
then [| phys_reg Float 115; phys_reg Vec128 115 |]
else [| phys_reg Float 115 |]
then [| phys_reg Float (100 + n); phys_reg Vec128 (100 + n) |]
else [| phys_reg Float (100 + n) |]

let destroyed_by_plt_stub =
if not X86_proc.use_plt then [| |] else [| r10; r11 |]
Expand Down Expand Up @@ -399,6 +400,21 @@ let destroyed_at_pushtrap =
let has_pushtrap traps =
List.exists (function Cmm.Push _ -> true | Pop _ -> false) traps

(* Physical registers reported as destroyed by the SIMD operation [op],
   derived from its [Simd_proc.register_behavior]: xmm0 (via [destroy_xmm 0])
   for the behaviors ending in [_to_xmm0], [rcx] for the behaviors ending in
   [_to_rcx], and nothing for every other behavior. *)
let destroyed_by_simd_op op =
  let behavior = Simd_proc.register_behavior op in
  match behavior with
  | R_to_fst | R_to_R | R_to_RM | RM_to_R
  | R_R_to_fst | R_RM_to_fst | R_RM_to_R
  | R_RM_xmm0_to_fst ->
    [||]
  | R_RM_to_rcx | R_RM_rax_rdx_to_rcx ->
    [| rcx |]
  | R_RM_to_xmm0 | R_RM_rax_rdx_to_xmm0 ->
    destroy_xmm 0

(* note: keep this function in sync with `destroyed_at_{basic,terminator}` below. *)
let destroyed_at_oper = function
Iop(Icall_ind | Icall_imm _) ->
Expand All @@ -410,7 +426,7 @@ let destroyed_at_oper = function
| Iop(Iintop(Idiv | Imod)) | Iop(Iintop_imm((Idiv | Imod), _))
-> [| rax; rdx |]
| Iop(Istore(Single, _, _))
-> destroy_xmm15 ()
-> destroy_xmm 15
| Iop(Ialloc _ | Ipoll _) -> destroyed_at_alloc_or_poll
| Iop(Iintop(Imulh _ | Icomp _) | Iintop_imm((Icomp _), _))
-> [| rax |]
Expand All @@ -420,9 +436,10 @@ let destroyed_at_oper = function
| Ireturn traps when has_pushtrap traps -> assert false
| Iop(Ispecific (Irdtsc | Irdpmc)) -> [| rax; rdx |]
| Iop(Ispecific(Ilfence | Isfence | Imfence)) -> [||]
| Iop(Ispecific(Isimd op)) -> destroyed_by_simd_op op
| Iop(Ispecific(Isextend32 | Izextend32 | Ilea _
| Istore_int (_, _, _) | Ioffset_loc (_, _)
| Ipause | Iprefetch _ | Isimd _
| Ipause | Iprefetch _
| Ifloatarithmem (_, _) | Ifloatsqrtf _ | Ibswap _))
| Iop(Iintop(Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
| Ipopcnt | Iclz _ | Ictz _ ))
Expand Down Expand Up @@ -465,14 +482,15 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
| Op (Intop (Idiv | Imod)) | Op (Intop_imm ((Idiv | Imod), _)) ->
[| rax; rdx |]
| Op(Store(Single, _, _)) ->
destroy_xmm15 ()
destroy_xmm 15
| Op(Intop(Imulh _ | Icomp _) | Intop_imm((Icomp _), _)) ->
[| rax |]
| Op (Specific (Irdtsc | Irdpmc)) ->
[| rax; rdx |]
| Op Poll -> destroyed_at_alloc_or_poll
| Op (Alloc _) ->
destroyed_at_alloc_or_poll
| Op (Specific (Isimd op)) -> destroyed_by_simd_op op
| Op (Move | Spill | Reload
| Const_int _ | Const_float _ | Const_symbol _ | Const_vec128 _
| Stackoffset _
Expand All @@ -497,7 +515,7 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
| Begin_region
| End_region
| Specific (Ilea _ | Istore_int _ | Ioffset_loc _
| Ifloatarithmem _ | Ifloatsqrtf _ | Ibswap _ | Isimd _
| Ifloatarithmem _ | Ifloatsqrtf _ | Ibswap _
| Isextend32 | Izextend32 | Ipause
| Iprefetch _ | Ilfence | Isfence | Imfence)
| Name_for_debugger _ | Dls_get)
Expand Down Expand Up @@ -596,6 +614,20 @@ let max_register_pressure =
consumes ~int:1 ~float:0
| Istore(Single, _, _) | Icompf _ ->
consumes ~int:0 ~float:1
| Ispecific(Isimd op) ->
(match Simd_proc.register_behavior op with
| R_RM_rax_rdx_to_xmm0
| R_RM_to_xmm0 -> consumes ~int:0 ~float:1
| R_RM_rax_rdx_to_rcx
| R_RM_to_rcx -> consumes ~int:1 ~float:0
| R_to_fst
| R_to_R
| R_to_RM
| RM_to_R
| R_R_to_fst
| R_RM_to_fst
| R_RM_to_R
| R_RM_xmm0_to_fst -> consumes ~int:0 ~float:0)
| Iintop(Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
| Ipopcnt|Iclz _| Ictz _)
| Iintop_imm((Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr
Expand All @@ -613,7 +645,7 @@ let max_register_pressure =
| Istackoffset _ | Iload _
| Ispecific(Ilea _ | Isextend32 | Izextend32 | Iprefetch _ | Ipause
| Irdtsc | Irdpmc | Istore_int (_, _, _)
| Ilfence | Isfence | Imfence | Isimd _
| Ilfence | Isfence | Imfence
| Ioffset_loc (_, _) | Ifloatarithmem (_, _) | Ifloatsqrtf _
| Ibswap _)
| Iname_for_debugger _ | Iprobe _ | Iprobe_is_enabled _ | Iopaque
Expand Down
2 changes: 1 addition & 1 deletion backend/amd64/regalloc_stack_operands.ml
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
| Op (Addf | Subf | Mulf | Divf) ->
may_use_stack_operand_for_second_argument map instr ~num_args:2 ~res_is_fst:true
| Op (Specific (Isimd op)) ->
(match Simd_selection.register_behavior op with
(match Simd_proc.register_behavior op with
| R_to_fst | R_to_R | R_R_to_fst -> May_still_have_spilled_registers
| R_RM_to_fst ->
may_use_stack_operand_for_second_argument map instr ~num_args:2 ~res_is_fst:true
Expand Down
2 changes: 1 addition & 1 deletion backend/amd64/simd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
(* *)
(**************************************************************************)

[@@@ocaml.warning "+a-4-30-40-41-42"]
[@@@ocaml.warning "+a-40-42"]

(* SIMD instructions for AMD64 *)

Expand Down
121 changes: 121 additions & 0 deletions backend/amd64/simd_proc.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
(**************************************************************************)
(* *)
(* OCaml *)
(* *)
(* Max Slater, Jane Street *)
(* *)
(* Copyright 2024 Jane Street Group LLC *)
(* *)
(* All rights reserved. This file is distributed under the terms of *)
(* the GNU Lesser General Public License version 2.1, with the *)
(* special exception on linking described in the file LICENSE. *)
(* *)
(**************************************************************************)

[@@@ocaml.warning "+a-40-42"]

(* SIMD register behavior for AMD64 *)

open Simd

(* Register constraints of a SIMD operation, as computed by
   [register_behavior] below.

   NOTE(review): the constructor names appear to read
   [<argument kinds>_to_<result kind>], where [R] is a register operand, [RM]
   a register-or-memory operand, [fst] a result that must coincide with the
   first argument, and [rax] / [rdx] / [rcx] / [xmm0] are fixed hardware
   registers -- confirm against the users of this type.

   This will need to be expanded with the addition of three and four argument
   operations in AVX2 and AVX512. *)
type register_behavior =
| R_to_fst
| R_to_R
| R_to_RM
| RM_to_R
| R_R_to_fst
| R_RM_to_fst
| R_RM_to_R
| R_RM_xmm0_to_fst
| R_RM_rax_rdx_to_rcx
| R_RM_to_rcx
| R_RM_rax_rdx_to_xmm0
| R_RM_to_xmm0

(* Register behavior of the CLMUL operations. *)
let register_behavior_clmul clmul_op =
  match clmul_op with
  | Clmul_64 _ -> R_RM_to_fst

(* Register behavior of the BMI2 operations. *)
let register_behavior_bmi2 bmi2_op =
  match bmi2_op with
  | Extract_64 | Deposit_64 -> R_RM_to_R

(* Register behavior of the SSE operations. *)
let register_behavior_sse sse_op =
  match sse_op with
  | Cmp_f32 _
  | Add_f32 | Sub_f32 | Mul_f32 | Div_f32
  | Max_f32 | Min_f32
  | Interleave_low_32 | Interleave_high_32
  | Shuffle_32 _ ->
    R_RM_to_fst
  | Movemask_32 -> R_to_R
  | High_64_to_low_64 | Low_64_to_high_64 -> R_R_to_fst
  | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 -> RM_to_R

(* Register behavior of the SSE2 operations. *)
let register_behavior_sse2 sse2_op =
  match sse2_op with
  | Add_i8 | Add_i16 | Add_i32 | Add_i64 | Add_f64
  | Add_saturating_i8 | Min_scalar_f64 | Max_scalar_f64
  | Add_saturating_i16 | Add_saturating_unsigned_i8
  | Add_saturating_unsigned_i16
  | Sub_i8 | Sub_i16 | Sub_i32 | Sub_i64 | Sub_f64
  | Sub_saturating_i8 | Sub_saturating_i16
  | Sub_saturating_unsigned_i8 | Sub_saturating_unsigned_i16
  | Max_unsigned_i8 | Max_i16 | Max_f64
  | Min_unsigned_i8 | Min_i16 | Min_f64
  | Mul_f64 | Div_f64
  | And_bits | Andnot_bits | Or_bits | Xor_bits
  | Cmpeq_i8 | Cmpeq_i16 | Cmpeq_i32
  | Cmpgt_i8 | Cmpgt_i16 | Cmpgt_i32 | Cmp_f64 _
  | SLL_i16 | SLL_i32 | SLL_i64
  | SRL_i16 | SRL_i32 | SRL_i64
  | SRA_i16 | SRA_i32
  | Avg_unsigned_i8 | Avg_unsigned_i16 | SAD_unsigned_i8
  | Shuffle_64 _
  | Interleave_high_8 | Interleave_high_16 | Interleave_high_64
  | Interleave_low_8 | Interleave_low_16 | Interleave_low_64
  | I16_to_i8 | I32_to_i16 | I16_to_unsigned_i8 | I32_to_unsigned_i16
  | Mulhi_i16 | Mulhi_unsigned_i16 | Mullo_i16 | Mul_hadd_i16_to_i32 ->
    R_RM_to_fst
  | Movemask_8 | Movemask_64 -> R_to_R
  | Sqrt_scalar_f64 ->
    (* Backwards compatibility *)
    R_to_R
  | SLLi_i16 _ | SLLi_i32 _ | SLLi_i64 _
  | SRLi_i16 _ | SRLi_i32 _ | SRLi_i64 _
  | SRAi_i16 _ | SRAi_i32 _
  | Shift_left_bytes _ | Shift_right_bytes _ ->
    R_to_fst
  | Shuffle_high_16 _ | Shuffle_low_16 _
  | I32_to_f64 | I32_to_f32 | F64_to_i32
  | Cast_scalar_f64_i64
  | F64_to_f32 | F32_to_i32 | F32_to_f64
  | Sqrt_f64 ->
    RM_to_R

(* Register behavior of the SSE3 operations. *)
let register_behavior_sse3 sse3_op =
  match sse3_op with
  | Addsub_f32 | Addsub_f64
  | Hadd_f32 | Hadd_f64
  | Hsub_f32 | Hsub_f64 ->
    R_RM_to_fst
  | Dup_low_64 | Dup_odd_32 | Dup_even_32 ->
    RM_to_R

(* Register behavior of the SSSE3 operations. *)
let register_behavior_ssse3 ssse3_op =
  match ssse3_op with
  | Hadd_i16 | Hadd_i32 | Hadd_saturating_i16
  | Hsub_i16 | Hsub_i32 | Hsub_saturating_i16
  | Mulsign_i8 | Mulsign_i16 | Mulsign_i32
  | Shuffle_8
  | Alignr_i8 _
  | Mul_unsigned_hadd_saturating_i8_to_i16 ->
    R_RM_to_fst
  | Abs_i8 | Abs_i16 | Abs_i32 ->
    RM_to_R

(* Register behavior of the SSE4.1 operations. *)
let register_behavior_sse41 sse41_op =
  match sse41_op with
  | Blend_16 _ | Blend_32 _ | Blend_64 _
  | Cmpeq_i64
  | Dp_f32 _ | Dp_f64 _
  | Max_i8 | Max_i32 | Max_unsigned_i16 | Max_unsigned_i32
  | Min_i8 | Min_i32 | Min_unsigned_i16 | Min_unsigned_i32
  | Insert_i8 _ | Insert_i16 _ | Insert_i32 _ | Insert_i64 _
  | Multi_sad_unsigned_i8 _
  | Mullo_i32 ->
    R_RM_to_fst
  | Extract_i8 _ | Extract_i16 _ ->
    (* CR mslater: (SIMD): replace once we have int8/int16/float32 *)
    R_to_R
  | Extract_i64 _ | Extract_i32 _ -> R_to_RM
  | Blendv_8 | Blendv_32 | Blendv_64 -> R_RM_xmm0_to_fst
  | I8_sx_i16 | I8_sx_i32 | I8_sx_i64
  | I16_sx_i32 | I16_sx_i64
  | I32_sx_i64
  | I8_zx_i16 | I8_zx_i32 | I8_zx_i64
  | I16_zx_i32 | I16_zx_i64
  | I32_zx_i64
  | Round_f64 _ | Round_f32 _
  | Minpos_unsigned_i16
  | Round_scalar_f64 _ ->
    RM_to_R

(* Register behavior of the SSE4.2 operations. The [Cmpestr*] and [Cmpistr*]
   string comparisons map to the behaviors with a fixed [rcx] or [xmm0]
   result; the [Cmpestr*] forms map to the ones that also mention
   [rax] and [rdx]. *)
let register_behavior_sse42 sse42_op =
  match sse42_op with
  | Crc32_64 | Cmpgt_i64 -> R_RM_to_fst
  | Cmpistra _ | Cmpistrc _ | Cmpistri _
  | Cmpistro _ | Cmpistrs _ | Cmpistrz _ ->
    R_RM_to_rcx
  | Cmpestra _ | Cmpestrc _ | Cmpestri _
  | Cmpestro _ | Cmpestrs _ | Cmpestrz _ ->
    R_RM_rax_rdx_to_rcx
  | Cmpistrm _ -> R_RM_to_xmm0
  | Cmpestrm _ -> R_RM_rax_rdx_to_xmm0

(* Register behavior of any SIMD operation: dispatches on the instruction-set
   extension the operation belongs to and defers to the matching
   per-extension table. *)
let register_behavior simd_op =
  match simd_op with
  | CLMUL clmul -> register_behavior_clmul clmul
  | BMI2 bmi2 -> register_behavior_bmi2 bmi2
  | SSE42 sse42 -> register_behavior_sse42 sse42
  | SSE41 sse41 -> register_behavior_sse41 sse41
  | SSSE3 ssse3 -> register_behavior_ssse3 ssse3
  | SSE3 sse3 -> register_behavior_sse3 sse3
  | SSE2 sse2 -> register_behavior_sse2 sse2
  | SSE sse -> register_behavior_sse sse
4 changes: 2 additions & 2 deletions backend/amd64/simd_reload.ml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
(* *)
(**************************************************************************)

[@@@ocaml.warning "+a-4-30-40-41-42"]
[@@@ocaml.warning "+a-40-42"]

(* SIMD instruction reload for AMD64 *)

let reload_operation makereg op arg res =
let stackp r =
match r.Reg.loc with Stack _ -> true | Reg _ | Unknown -> false
in
match Simd_selection.register_behavior op with
match Simd_proc.register_behavior op with
| R_to_fst ->
(* Argument must be in a register; result must be the argument. *)
let arg0 = if stackp arg.(0) then makereg arg.(0) else arg.(0) in
Expand Down
Loading
Loading