Skip to content

Commit ba15ee5

Browse files
authored
Vectorize [Ifloatarithmem] (#3452)
* Add [Isimd_mem] to [Arch.Specific] and emit [addpd] and similar instructions with a memory argument.
* Vectorize [Ifloatarithmem]: when the memory alignment is known to be 128-bit (currently, never), emit [addpd]; otherwise, emit a vector load followed by an arithmetic instruction.
1 parent 9755b39 commit ba15ee5

14 files changed

+305
-47
lines changed

backend/amd64/CSE.ml

+12-6
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ open Arch
2020
open Mach
2121
open CSE_utils
2222

23+
let of_simd_class (cl : Simd.operation_class) =
24+
match cl with
25+
| Pure -> Op_pure
26+
| Load { is_mutable = true } -> Op_load Mutable
27+
| Load { is_mutable = false } -> Op_load Immutable
28+
2329
class cse = object
2430

2531
inherit CSEgen.cse_generic as super
@@ -36,9 +42,9 @@ method! class_of_operation op =
3642
| Irdtsc | Irdpmc
3743
| Ilfence | Isfence | Imfence -> Op_other
3844
| Isimd op ->
39-
begin match Simd.class_of_operation op with
40-
| Pure -> Op_pure
41-
end
45+
of_simd_class (Simd.class_of_operation op)
46+
| Isimd_mem (op,_addr) ->
47+
of_simd_class (Simd.Mem.class_of_operation op)
4248
| Ipause
4349
| Icldemote _
4450
| Iprefetch _ -> Op_other
@@ -80,9 +86,9 @@ class cfg_cse = object
8086
| Irdtsc | Irdpmc
8187
| Ilfence | Isfence | Imfence -> Op_other
8288
| Isimd op ->
83-
begin match Simd.class_of_operation op with
84-
| Pure -> Op_pure
85-
end
89+
of_simd_class (Simd.class_of_operation op)
90+
| Isimd_mem (op,_addr) ->
91+
of_simd_class (Simd.Mem.class_of_operation op)
8692
| Ipause
8793
| Icldemote _
8894
| Iprefetch _ -> Op_other

backend/amd64/arch.ml

+14-4
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@ type specific_operation =
152152
| Imfence (* memory fence *)
153153
| Ipause (* hint for spin-wait loops *)
154154
| Isimd of Simd.operation (* SIMD instruction set operations *)
155+
| Isimd_mem of Simd.Mem.operation * addressing_mode
156+
(* SIMD instruction set operations
157+
with memory args *)
155158
| Icldemote of addressing_mode (* hint to demote a cacheline to L3 *)
156159
| Iprefetch of (* memory prefetching hint *)
157160
{ is_write: bool;
@@ -272,6 +275,8 @@ let print_specific_operation printreg op ppf arg =
272275
fprintf ppf "rdpmc %a" printreg arg.(0)
273276
| Isimd simd ->
274277
Simd.print_operation printreg simd ppf arg
278+
| Isimd_mem (simd, addr) ->
279+
Simd.Mem.print_operation printreg (print_addressing printreg addr) simd ppf arg
275280
| Ipause ->
276281
fprintf ppf "pause"
277282
| Icldemote _ ->
@@ -298,13 +303,14 @@ let operation_is_pure = function
298303
| Istore_int (_, _, _) | Ioffset_loc (_, _)
299304
| Icldemote _ | Iprefetch _ -> false
300305
| Isimd op -> Simd.is_pure op
306+
| Isimd_mem (op, _addr) -> Simd.Mem.is_pure op
301307

302308
(* Specific operations that can raise *)
303309
(* Keep in sync with [Vectorize_specific] *)
304310
let operation_can_raise = function
305311
| Ilea _ | Ibswap _ | Isextend32 | Izextend32
306312
| Ifloatarithmem _
307-
| Irdtsc | Irdpmc | Ipause | Isimd _
313+
| Irdtsc | Irdpmc | Ipause | Isimd _ | Isimd_mem _
308314
| Ilfence | Isfence | Imfence
309315
| Istore_int (_, _, _) | Ioffset_loc (_, _)
310316
| Icldemote _ | Iprefetch _ -> false
@@ -313,7 +319,7 @@ let operation_can_raise = function
313319
let operation_allocates = function
314320
| Ilea _ | Ibswap _ | Isextend32 | Izextend32
315321
| Ifloatarithmem _
316-
| Irdtsc | Irdpmc | Ipause | Isimd _
322+
| Irdtsc | Irdpmc | Ipause | Isimd _ | Isimd_mem _
317323
| Ilfence | Isfence | Imfence
318324
| Istore_int (_, _, _) | Ioffset_loc (_, _)
319325
| Icldemote _ | Iprefetch _ -> false
@@ -404,9 +410,11 @@ let equal_specific_operation left right =
404410
&& equal_addressing_mode left_addr right_addr
405411
| Isimd l, Isimd r ->
406412
Simd.equal_operation l r
413+
| Isimd_mem (l,al), Isimd_mem (r,ar) ->
414+
Simd.Mem.equal_operation l r && equal_addressing_mode al ar
407415
| (Ilea _ | Istore_int _ | Ioffset_loc _ | Ifloatarithmem _ | Ibswap _ |
408416
Isextend32 | Izextend32 | Irdtsc | Irdpmc | Ilfence | Isfence | Imfence |
409-
Ipause | Isimd _ | Icldemote _ | Iprefetch _), _ ->
417+
Ipause | Isimd _ | Isimd_mem _ | Icldemote _ | Iprefetch _), _ ->
410418
false
411419

412420
(* addressing mode functions *)
@@ -511,7 +519,9 @@ let isomorphic_specific_operation op1 op2 =
511519
&& equal_addressing_mode_without_displ left_addr right_addr
512520
| Isimd l, Isimd r ->
513521
Simd.equal_operation l r
522+
| Isimd_mem (l,al), Isimd_mem (r,ar) ->
523+
Simd.Mem.equal_operation l r && equal_addressing_mode_without_displ al ar
514524
| (Ilea _ | Istore_int _ | Ioffset_loc _ | Ifloatarithmem _ | Ibswap _ |
515525
Isextend32 | Izextend32 | Irdtsc | Irdpmc | Ilfence | Isfence | Imfence |
516-
Ipause | Isimd _ | Icldemote _ | Iprefetch _), _ ->
526+
Ipause | Isimd _ | Isimd_mem _ | Icldemote _ | Iprefetch _), _ ->
517527
false

backend/amd64/arch.mli

+3
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ type specific_operation =
8585
| Imfence (* memory fence *)
8686
| Ipause (* hint for spin-wait loops *)
8787
| Isimd of Simd.operation (* SIMD instruction set operations *)
88+
| Isimd_mem of Simd.Mem.operation * addressing_mode
89+
(* SIMD instruction set operations
90+
with memory args *)
8891
| Icldemote of addressing_mode (* hint to demote a cacheline to L3 *)
8992
| Iprefetch of (* memory prefetching hint *)
9093
{ is_write: bool;

backend/amd64/cfg_selection.ml

+8-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,14 @@ let pseudoregs_for_operation op arg res =
8686
edx (high) and eax (low). Make it simple and force the argument in rcx,
8787
and rax and rdx clobbered *)
8888
[| rcx |], res
89-
| Specific (Isimd op) -> Simd_selection.pseudoregs_for_operation op arg res
89+
| Specific (Isimd op) ->
90+
Simd_selection.pseudoregs_for_operation
91+
(Simd_proc.register_behavior op)
92+
arg res
93+
| Specific (Isimd_mem (op, _addr)) ->
94+
Simd_selection.pseudoregs_for_operation
95+
(Simd_proc.Mem.register_behavior op)
96+
arg res
9097
| Csel _ ->
9198
(* last arg must be the same as res.(0) *)
9299
let len = Array.length arg in

backend/amd64/emit.mlp

+21-2
Original file line numberDiff line numberDiff line change
@@ -1034,8 +1034,8 @@ let emit_static_cast (cast : Cmm.static_cast) i =
10341034
CR mslater: (SIMD) don't load 32 bits once we have unboxed int16/int8 *)
10351035
I.movd (arg32 i 0) (res i 0)
10361036

1037-
let emit_simd_instr op i =
1038-
(match Simd_proc.register_behavior op with
1037+
let check_simd_instr (register_behavior : Simd_proc.register_behavior) i =
1038+
(match register_behavior with
10391039
| R_to_fst ->
10401040
assert (Reg.same_loc i.arg.(0) i.res.(0));
10411041
assert (Reg.is_reg i.arg.(0))
@@ -1075,6 +1075,23 @@ let emit_simd_instr op i =
10751075
assert (Reg.is_reg i.arg.(0));
10761076
assert (Reg.same_loc i.res.(0) (phys_xmm0v ()))
10771077
);
1078+
()
1079+
1080+
let emit_simd_instr_with_memory_arg op i addressing_mode =
1081+
check_simd_instr (Simd_proc.Mem.register_behavior op) i;
1082+
let addr = addressing addressing_mode VEC128 i 1 in
1083+
match (op : Simd.Mem.operation) with
1084+
| SSE2 Add_f64 -> I.addpd addr (res i 0)
1085+
| SSE2 Sub_f64 -> I.subpd addr (res i 0)
1086+
| SSE2 Mul_f64 -> I.mulpd addr (res i 0)
1087+
| SSE2 Div_f64 -> I.divpd addr (res i 0)
1088+
| SSE Add_f32 -> I.addps addr (res i 0)
1089+
| SSE Sub_f32 -> I.subps addr (res i 0)
1090+
| SSE Mul_f32 -> I.mulps addr (res i 0)
1091+
| SSE Div_f32 -> I.divps addr (res i 0)
1092+
1093+
let emit_simd_instr op i =
1094+
check_simd_instr (Simd_proc.register_behavior op) i;
10781095
match (op : Simd.operation) with
10791096
| CLMUL (Clmul_64 n) -> I.pclmulqdq (X86_dsl.int n) (arg i 1) (res i 0)
10801097
| BMI2 Extract_64 -> I.pext (arg i 1) (arg i 0) (res i 0)
@@ -1714,6 +1731,8 @@ let emit_instr ~first ~fallthrough i =
17141731
I.mfence ()
17151732
| Lop (Specific (Isimd op)) ->
17161733
emit_simd_instr op i
1734+
| Lop (Specific (Isimd_mem (op, addressing_mode))) ->
1735+
emit_simd_instr_with_memory_arg op i addressing_mode
17171736
| Lop (Static_cast cast) ->
17181737
emit_static_cast cast i
17191738
| Lop (Reinterpret_cast cast) ->

backend/amd64/proc.ml

+34-22
Original file line numberDiff line numberDiff line change
@@ -461,8 +461,8 @@ let destroyed_at_pushtrap =
461461
let has_pushtrap traps =
462462
List.exists (function Cmm.Push _ -> true | Pop _ -> false) traps
463463

464-
let destroyed_by_simd_op op =
465-
match Simd_proc.register_behavior op with
464+
let destroyed_by_simd_op (register_behavior : Simd_proc.register_behavior) =
465+
match register_behavior with
466466
| R_RM_rax_rdx_to_xmm0
467467
| R_RM_to_xmm0 -> destroy_xmm 0
468468
| R_RM_rax_rdx_to_rcx
@@ -496,7 +496,10 @@ let destroyed_at_oper = function
496496
| Ireturn traps when has_pushtrap traps -> assert false
497497
| Iop(Ispecific (Irdtsc | Irdpmc)) -> [| rax; rdx |]
498498
| Iop(Ispecific(Ilfence | Isfence | Imfence)) -> [||]
499-
| Iop(Ispecific(Isimd op)) -> destroyed_by_simd_op op
499+
| Iop(Ispecific(Isimd op)) ->
500+
destroyed_by_simd_op (Simd_proc.register_behavior op)
501+
| Iop(Ispecific(Isimd_mem (op,_))) ->
502+
destroyed_by_simd_op (Simd_proc.Mem.register_behavior op)
500503
| Iop(Ispecific(Isextend32 | Izextend32 | Ilea _
501504
| Istore_int (_, _, _) | Ioffset_loc (_, _)
502505
| Ipause | Icldemote _ | Iprefetch _
@@ -549,7 +552,10 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
549552
| Op Poll -> destroyed_at_alloc_or_poll
550553
| Op (Alloc _) ->
551554
destroyed_at_alloc_or_poll
552-
| Op (Specific (Isimd op)) -> destroyed_by_simd_op op
555+
| Op (Specific (Isimd op)) ->
556+
destroyed_by_simd_op (Simd_proc.register_behavior op)
557+
| Op (Specific (Isimd_mem (op,_))) ->
558+
destroyed_by_simd_op (Simd_proc.Mem.register_behavior op)
553559
| Op (Move | Spill | Reload
554560
| Const_int _ | Const_float _ | Const_float32 _ | Const_symbol _
555561
| Const_vec128 _
@@ -601,7 +607,7 @@ let destroyed_at_terminator (terminator : Cfg_intf.S.terminator) =
601607
| Call {op = Indirect | Direct _; _} -> all_phys_regs
602608
| Specific_can_raise { op = (Ilea _ | Ibswap _ | Isextend32 | Izextend32
603609
| Ifloatarithmem _ | Irdtsc | Irdpmc | Ipause
604-
| Isimd _ | Ilfence | Isfence | Imfence
610+
| Isimd _ | Isimd_mem _ | Ilfence | Isfence | Imfence
605611
| Istore_int (_, _, _) | Ioffset_loc (_, _)
606612
| Icldemote _ | Iprefetch _); _ } ->
607613
Misc.fatal_error "no instructions specific for this architecture can raise"
@@ -631,7 +637,7 @@ let is_destruction_point ~(more_destruction_points : bool) (terminator : Cfg_int
631637
true
632638
| Specific_can_raise { op = (Ilea _ | Ibswap _ | Isextend32 | Izextend32
633639
| Ifloatarithmem _ | Irdtsc | Irdpmc | Ipause
634-
| Isimd _ | Ilfence | Isfence | Imfence
640+
| Isimd _ | Isimd_mem _ | Ilfence | Isfence | Imfence
635641
| Istore_int (_, _, _) | Ioffset_loc (_, _)
636642
| Icldemote _ | Iprefetch _); _ } ->
637643
Misc.fatal_error "no instructions specific for this architecture can raise"
@@ -655,13 +661,29 @@ let safe_register_pressure = function
655661
| Ibeginregion | Iendregion | Idls_get
656662
-> if fp then 10 else 11
657663

658-
let max_register_pressure =
664+
let max_register_pressure op =
659665
let consumes ~int ~float =
660666
if fp
661667
then [| 12 - int; 16 - float |]
662668
else [| 13 - int; 16 - float |]
663-
in function
664-
Iextcall _ ->
669+
in
670+
let simd_max_register_pressure (register_behavior : Simd_proc.register_behavior) =
671+
(match register_behavior with
672+
| R_RM_rax_rdx_to_xmm0
673+
| R_RM_to_xmm0 -> consumes ~int:0 ~float:1
674+
| R_RM_rax_rdx_to_rcx
675+
| R_RM_to_rcx -> consumes ~int:1 ~float:0
676+
| R_to_fst
677+
| R_to_R
678+
| R_to_RM
679+
| RM_to_R
680+
| R_R_to_fst
681+
| R_RM_to_fst
682+
| R_RM_to_R
683+
| R_RM_xmm0_to_fst -> consumes ~int:0 ~float:0)
684+
in
685+
match op with
686+
| Iextcall _ ->
665687
if win64
666688
then consumes ~int:5 ~float:6
667689
else consumes ~int:9 ~float:16
@@ -675,19 +697,9 @@ let max_register_pressure =
675697
| Ifloatop ((Float64 | Float32), Icompf _) ->
676698
consumes ~int:0 ~float:1
677699
| Ispecific(Isimd op) ->
678-
(match Simd_proc.register_behavior op with
679-
| R_RM_rax_rdx_to_xmm0
680-
| R_RM_to_xmm0 -> consumes ~int:0 ~float:1
681-
| R_RM_rax_rdx_to_rcx
682-
| R_RM_to_rcx -> consumes ~int:1 ~float:0
683-
| R_to_fst
684-
| R_to_R
685-
| R_to_RM
686-
| RM_to_R
687-
| R_R_to_fst
688-
| R_RM_to_fst
689-
| R_RM_to_R
690-
| R_RM_xmm0_to_fst -> consumes ~int:0 ~float:0)
700+
simd_max_register_pressure (Simd_proc.register_behavior op)
701+
| Ispecific(Isimd_mem (op,_)) ->
702+
simd_max_register_pressure (Simd_proc.Mem.register_behavior op)
691703
| Iintop(Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
692704
| Ipopcnt|Iclz _| Ictz _)
693705
| Iintop_imm((Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr

backend/amd64/regalloc_stack_operands.ml

+15
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,21 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
176176
may_use_stack_operand_for_second_argument map instr ~num_args:3 ~res_is_fst:true
177177
| R_to_RM -> may_use_stack_operand_for_result map instr ~num_args:1
178178
| RM_to_R -> may_use_stack_operand_for_only_argument map instr ~has_result:true)
179+
| Op (Specific (Isimd_mem (op,_))) ->
180+
(match Simd_proc.Mem.register_behavior op with
181+
| R_RM_to_fst -> May_still_have_spilled_registers
182+
| R_to_fst
183+
| R_to_R
184+
| R_to_RM
185+
| RM_to_R
186+
| R_R_to_fst
187+
| R_RM_to_R
188+
| R_RM_xmm0_to_fst
189+
| R_RM_rax_rdx_to_rcx
190+
| R_RM_to_rcx
191+
| R_RM_rax_rdx_to_xmm0
192+
| R_RM_to_xmm0
193+
-> Misc.fatal_error "Unexpected simd operation with memory arguments")
179194
| Op (Reinterpret_cast (Float_of_float32 | Float32_of_float | V128_of_v128))
180195
| Op (Static_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2))
181196
| Op (Static_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4)) ->

backend/amd64/reload.ml

+1
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ method! reload_operation op arg res =
126126
then (let r = self#makereg res.(0) in (arg, [|r|]))
127127
else (arg, res)
128128
| Ispecific(Isimd op) -> Simd_reload.reload_operation self#makereg op arg res
129+
| Ispecific(Isimd_mem (op,_)) -> Simd_reload.Mem.reload_operation self#makereg op arg res
129130
| Iconst_int n ->
130131
if n <= 0x7FFFFFFFn && n >= -0x80000000n
131132
then (arg, res)

backend/amd64/selection.ml

+8-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,14 @@ let pseudoregs_for_operation op arg res =
8080
edx (high) and eax (low). Make it simple and force the argument in rcx,
8181
and rax and rdx clobbered *)
8282
[| rcx |], res
83-
| Ispecific (Isimd op) -> Simd_selection.pseudoregs_for_operation op arg res
83+
| Ispecific (Isimd op) ->
84+
Simd_selection.pseudoregs_for_operation
85+
(Simd_proc.register_behavior op)
86+
arg res
87+
| Ispecific (Isimd_mem (op, _addr)) ->
88+
Simd_selection.pseudoregs_for_operation
89+
(Simd_proc.Mem.register_behavior op)
90+
arg res
8491
| Icsel _ ->
8592
(* last arg must be the same as res.(0) *)
8693
let len = Array.length arg in

0 commit comments

Comments
 (0)