Skip to content

Commit ec08424

Browse files
authored
Add peephole optimizations for CFG blocks. (ocaml-flambda#1666)
1 parent 9f135a4 commit ec08424

15 files changed

+426
-13
lines changed

backend/.ocamlformat-enable

+2
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,7 @@ asm_targets/**/*.ml
1212
asm_targets/**/*.mli
1313
debug/**/*.ml
1414
debug/**/*.mli
15+
peephole/**/*.ml
16+
peephole/**/*.mli
1517
regalloc/**/*.ml
1618
regalloc/**/*.mli

backend/asmgen.ml

+4
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,10 @@ let compile_fundecl ~ppf_dump ~funcnames fd_cmm =
308308
++ Cfg_with_infos.cfg_with_layout
309309
++ Profile.record ~accumulate:true "cfg_validate_description" (Regalloc_validate.run cfg_description)
310310
++ Profile.record ~accumulate:true "cfg_simplify" Regalloc_utils.simplify_cfg
311+
(* CR-someday gtulbalecu: The peephole optimizations must not affect liveness, otherwise
312+
we would have to recompute it here. Recomputing it here breaks the CI because
313+
the liveness_analysis algorithm does not work properly after register allocation. *)
314+
++ Profile.record ~accumulate:true "peephole_optimize_cfg" Peephole_optimize.peephole_optimize_cfg
311315
++ Profile.record ~accumulate:true "save_cfg" save_cfg
312316
++ Profile.record ~accumulate:true "cfg_reorder_blocks"
313317
(reorder_blocks_random ppf_dump)

backend/peephole/peephole_optimize.ml

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
[@@@ocaml.warning "+a-29-40-41-42"]
2+
3+
module DLL = Flambda_backend_utils.Doubly_linked_list
4+
module R = Peephole_rules
5+
6+
(* We currently don't check that the peephole optimizer terminates. In the case
7+
that the peephole optimization does not terminate we limit the number of
8+
steps to be linear with respect to the block's body (i.e.
9+
O(block_body_length) with a small constant). *)
10+
let termination_cond_const = 5
11+
12+
(* Here cell is an iterator of the doubly linked list data structure that
13+
encapsulates the body's instructions. *)
14+
let rec optimize_body steps_until_termination cell =
15+
if steps_until_termination > 0
16+
then
17+
match R.apply cell with
18+
| None -> (
19+
match DLL.next cell with
20+
| None -> ()
21+
| Some next_cell -> optimize_body (steps_until_termination - 1) next_cell)
22+
| Some continuation_cell ->
23+
optimize_body (steps_until_termination - 1) continuation_cell
24+
25+
(* Apply peephole optimization for the body of each block of the CFG*)
26+
let peephole_optimize_cfg cfg_with_layout =
27+
if !Flambda_backend_flags.cfg_peephole_optimize
28+
then
29+
Cfg.iter_blocks (Cfg_with_layout.cfg cfg_with_layout)
30+
~f:(fun (_ : int) block ->
31+
Option.iter
32+
(optimize_body (termination_cond_const * DLL.length block.body))
33+
(DLL.hd_cell block.body));
34+
cfg_with_layout
+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[@@@ocaml.warning "+a-30-40-41-42"]
2+
3+
val peephole_optimize_cfg : Cfg_with_layout.t -> Cfg_with_layout.t

backend/peephole/peephole_rules.ml

+199
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
(* CR-someday: see whether the `-4` can be dropped. *)
2+
[@@@ocaml.warning "+a-29-40-41-42-4"]
3+
4+
module DLL = Flambda_backend_utils.Doubly_linked_list
5+
module U = Peephole_utils
6+
7+
(** Logical condition for simplifying the following case:
8+
{|
9+
mov x, y
10+
mov y, x
11+
|}
12+
13+
In this case, the second instruction should be removed *)
14+
15+
let remove_useless_mov (cell : Cfg.basic Cfg.instruction DLL.cell) =
16+
match U.get_cells cell 2 with
17+
| [fst; snd] -> (
18+
let fst_val = DLL.value fst in
19+
let snd_val = DLL.value snd in
20+
match fst_val.desc with
21+
| Op (Move | Spill | Reload) -> (
22+
let fst_src, fst_dst = fst_val.arg.(0), fst_val.res.(0) in
23+
match snd_val.desc with
24+
| Op (Move | Spill | Reload) ->
25+
let snd_src, snd_dst = snd_val.arg.(0), snd_val.res.(0) in
26+
if U.are_equal_regs fst_src snd_dst && U.are_equal_regs fst_dst snd_src
27+
then (
28+
DLL.delete_curr snd;
29+
Some (U.prev_at_most U.go_back_const fst))
30+
else None
31+
| _ -> None)
32+
| _ -> None)
33+
| _ -> None
34+
35+
(** Logical condition for simplifying the following case:
36+
{|
37+
<op 1> const1, r
38+
<op 2> const2, r
39+
|}
40+
41+
to:
42+
{|
43+
<op 1> (const1 <op 2> const2), r
44+
|}
45+
46+
Where <op 1> and <op 2> can be any two binary operators that are associative and commutative
47+
and const1 and const2 are immediate values. *)
48+
49+
let are_compatible op1 op2 imm1 imm2 =
50+
match (op1 : Mach.integer_operation), (op2 : Mach.integer_operation) with
51+
(* Folding two bitwise operations such as (AND, OR, XOR) should never produce
52+
an overflow so we assert this conditon. *)
53+
| Mach.Iand, Mach.Iand ->
54+
assert (U.amd64_imm32_within_bounds imm1 imm2 ( land ));
55+
Some (Mach.Iand, imm1 land imm2)
56+
| Ior, Ior ->
57+
assert (U.amd64_imm32_within_bounds imm1 imm2 ( lor ));
58+
Some (Mach.Ior, imm1 lor imm2)
59+
| Ixor, Ixor ->
60+
assert (U.amd64_imm32_within_bounds imm1 imm2 ( lxor ));
61+
Some (Mach.Ixor, imm1 lxor imm2)
62+
(* For the following three cases we have the issue that in some situations,
63+
one or both immediate values could be out of bounds, but the result might
64+
be within bounds (e.g. imm1 = -4 and imm2 = 65, their sum being 61). This
65+
should not happen at all since the immediate values should always be within
66+
the bounds [0, Sys.int_size]. *)
67+
| Ilsl, Ilsl ->
68+
if Misc.no_overflow_add imm1 imm2 && imm1 + imm2 <= Sys.int_size
69+
then (
70+
U.bitwise_shift_assert imm1 imm2;
71+
Some (Mach.Ilsl, imm1 + imm2))
72+
else None
73+
| Ilsr, Ilsr ->
74+
if Misc.no_overflow_add imm1 imm2 && imm1 + imm2 <= Sys.int_size
75+
then (
76+
U.bitwise_shift_assert imm1 imm2;
77+
Some (Mach.Ilsr, imm1 + imm2))
78+
else None
79+
| Iasr, Iasr ->
80+
if Misc.no_overflow_add imm1 imm2 && imm1 + imm2 <= Sys.int_size
81+
then (
82+
U.bitwise_shift_assert imm1 imm2;
83+
Some (Mach.Iasr, imm1 + imm2))
84+
else None
85+
(* for the amd64 instruction set the `ADD` `SUB` `MUL` opperations take at
86+
most an imm32 as the second argument, so we need to check for overflows on
87+
32-bit signed ints. *)
88+
(* CR-someday gtulba-lecu: This condition is architecture specific and should
89+
either live in amd64 specific code or this module should contain
90+
information about the architecture target. *)
91+
| Iadd, Iadd ->
92+
if Misc.no_overflow_add imm1 imm2
93+
&& U.amd64_imm32_within_bounds imm1 imm2 ( + )
94+
then Some (Mach.Iadd, imm1 + imm2)
95+
else None
96+
| Iadd, Isub ->
97+
if imm1 >= imm2
98+
then
99+
if Misc.no_overflow_sub imm1 imm2
100+
&& U.amd64_imm32_within_bounds imm1 imm2 ( - )
101+
then Some (Mach.Iadd, imm1 - imm2)
102+
else None
103+
else if Misc.no_overflow_sub imm2 imm1
104+
&& U.amd64_imm32_within_bounds imm2 imm1 ( - )
105+
then Some (Mach.Isub, imm2 - imm1)
106+
else None
107+
| Isub, Isub ->
108+
if Misc.no_overflow_add imm1 imm2
109+
&& U.amd64_imm32_within_bounds imm1 imm2 ( + )
110+
then Some (Mach.Isub, imm1 + imm2)
111+
else None
112+
| Isub, Iadd ->
113+
if imm1 >= imm2
114+
then
115+
if Misc.no_overflow_sub imm1 imm2
116+
&& U.amd64_imm32_within_bounds imm1 imm2 ( - )
117+
then Some (Mach.Isub, imm1 - imm2)
118+
else None
119+
else if Misc.no_overflow_sub imm2 imm1
120+
&& U.amd64_imm32_within_bounds imm2 imm1 ( - )
121+
then Some (Mach.Iadd, imm2 - imm1)
122+
else None
123+
| Ilsl, Imul ->
124+
if imm1 >= 0 && imm1 < 31
125+
&& Misc.no_overflow_mul (1 lsl imm1) imm2
126+
&& U.amd64_imm32_within_bounds (1 lsl imm1) imm2 ( * )
127+
then Some (Mach.Imul, (1 lsl imm1) * imm2)
128+
else None
129+
| Imul, Ilsl ->
130+
if imm2 >= 0 && imm2 < 31
131+
&& Misc.no_overflow_mul imm1 (1 lsl imm2)
132+
&& U.amd64_imm32_within_bounds imm1 (1 lsl imm2) ( * )
133+
then Some (Mach.Imul, imm1 * (1 lsl imm2))
134+
else None
135+
| Imul, Imul ->
136+
if Misc.no_overflow_mul imm1 imm2
137+
&& U.amd64_imm32_within_bounds imm1 imm2 ( * )
138+
then Some (Mach.Imul, imm1 * imm2)
139+
else None
140+
(* CR-soon gtulba-lecu: check this last case | Imod, Imod -> if imm1 mod imm2
141+
= 0 then Some (Mach.Imod, imm2) else None
142+
143+
The integer modulo imm2 group is a subgroup of the integer modulo imm1 iff
144+
imm2 divides imm1
145+
146+
This is because the operations in the groups are addition modulo n and m
147+
respectively. If n divides m, then every result of the operation (addition)
148+
in the n group will also be a legal result in the m group, which is
149+
essentially the definition of a subgroup. If n does not divide m, there
150+
will be some results in the n group that are not acceptable in the m
151+
group. *)
152+
| _ -> None
153+
154+
let fold_intop_imm (cell : Cfg.basic Cfg.instruction DLL.cell) =
155+
match U.get_cells cell 2 with
156+
| [fst; snd] ->
157+
let fst_val = DLL.value fst in
158+
let snd_val = DLL.value snd in
159+
(* The following check does the following: 1. Ensures that both instructions
160+
use the same source register; 2. Ensures that both instructions output
161+
the result to the source register, this is redundant for amd64 since
162+
there are no instructions that invalidate this condition. *)
163+
(* CR-someday gtulba-lecu: This condition is architecture specific and
164+
should either live in amd64 specific code or this module should contain
165+
information about the architecture target. *)
166+
if Array.length fst_val.arg = 1
167+
&& Array.length snd_val.arg = 1
168+
&& Array.length fst_val.res = 1
169+
&& Array.length snd_val.res = 1
170+
&& U.are_equal_regs
171+
(Array.unsafe_get fst_val.arg 0)
172+
(Array.unsafe_get snd_val.arg 0)
173+
&& U.are_equal_regs
174+
(Array.unsafe_get fst_val.arg 0)
175+
(Array.unsafe_get fst_val.res 0)
176+
&& U.are_equal_regs
177+
(Array.unsafe_get snd_val.arg 0)
178+
(Array.unsafe_get snd_val.res 0)
179+
then
180+
match fst_val.desc, snd_val.desc with
181+
| Op (Intop_imm (op1, imm1)), Op (Intop_imm (op2, imm2)) -> (
182+
match are_compatible op1 op2 imm1 imm2 with
183+
| Some (op, imm) ->
184+
let new_cell =
185+
DLL.insert_and_return_before fst
186+
{ fst_val with desc = Cfg.Op (Intop_imm (op, imm)) }
187+
in
188+
DLL.delete_curr fst;
189+
DLL.delete_curr snd;
190+
Some ((U.prev_at_most U.go_back_const) new_cell)
191+
| _ -> None)
192+
| _ -> None
193+
else None
194+
| _ -> None
195+
196+
let apply cell =
197+
match remove_useless_mov cell with
198+
| None -> ( match fold_intop_imm cell with None -> None | res -> res)
199+
| res -> res

backend/peephole/peephole_rules.mli

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[@@@ocaml.warning "+a-29-40-41-42"]
2+
3+
open! Peephole_utils
4+
5+
val apply :
6+
Cfg.basic Cfg.instruction DLL.cell ->
7+
Cfg.basic Cfg.instruction DLL.cell option

backend/peephole/peephole_utils.ml

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
module DLL = Flambda_backend_utils.Doubly_linked_list
2+
3+
(* CR-someday gtulba-lecu: make sure that this comparison is correct and
4+
sufficent. Take into consideration using Proc.regs_are_volatile in the
5+
future. As we only support amd64 and Proc.regs_are_volatile is always false
6+
in amd64 this is not necessary for now. See backend/cfg/cfg_deadcode.ml for
7+
more details.*)
8+
let are_equal_regs (reg1 : Reg.t) (reg2 : Reg.t) =
9+
Reg.same_loc reg1 reg2 && reg1.typ = reg2.typ
10+
11+
(* CR-soon gtulba-lecu: Delete this when imeplementing auto-generated rules. *)
12+
let go_back_const = 1
13+
14+
let rec prev_at_most steps cell =
15+
(* Convention: must try to go back at least one element *)
16+
assert (steps > 0);
17+
match DLL.prev cell with
18+
| Some prev_cell ->
19+
if steps = 1 then prev_cell else prev_at_most (steps - 1) prev_cell
20+
| None -> cell
21+
22+
let rec get_cells' (cell : Cfg.basic Cfg.instruction DLL.cell option) size lst =
23+
match cell with
24+
| Some cell -> (
25+
match size with
26+
| 0 -> List.rev lst
27+
| size -> get_cells' (DLL.next cell) (size - 1) (cell :: lst))
28+
| None -> List.rev lst
29+
30+
let get_cells cell size =
31+
assert (size > 0);
32+
get_cells' (DLL.next cell) (size - 1) [cell]
33+
34+
let is_bitwise_op (op : Mach.integer_operation) =
35+
match op with
36+
| Mach.Iand | Ior | Ixor | Ilsl | Ilsr | Iasr -> true
37+
| _ -> false
38+
[@@ocaml.warning "-4"]
39+
40+
let bitwise_shift_assert (imm1 : int) (imm2 : int) =
41+
if imm1 < 0 || imm1 > Sys.int_size || imm2 < 0 || imm2 > Sys.int_size
42+
then assert false
43+
[@@inline]
44+
45+
(* CR-someday gtulba-lecu: This is architecture specific and should be moved in
46+
a different part of the compiler that is specific to the amd64 architecture.
47+
This is fine for now as we only support amd64. *)
48+
let amd64_imm32_within_bounds imm1 imm2 op =
49+
let imm = op imm1 imm2 in
50+
Int32.to_int Int32.min_int <= imm && imm <= Int32.to_int Int32.max_int
51+
[@@inline]

backend/peephole/peephole_utils.mli

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[@@@ocaml.warning "+a-29-40-41-42"]
2+
3+
module DLL = Flambda_backend_utils.Doubly_linked_list
4+
5+
val are_equal_regs : Reg.t -> Reg.t -> bool
6+
7+
val go_back_const : int
8+
9+
val prev_at_most : int -> 'a DLL.cell -> 'a DLL.cell
10+
11+
val get_cells :
12+
Cfg.basic Cfg.instruction DLL.cell ->
13+
int ->
14+
Cfg.basic Cfg.instruction DLL.cell list
15+
16+
val is_bitwise_op : Mach.integer_operation -> bool
17+
18+
val bitwise_shift_assert : int -> int -> unit
19+
20+
val amd64_imm32_within_bounds : int -> int -> (int -> int -> int) -> bool

driver/flambda_backend_args.ml

+16
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,12 @@ let mk_regalloc_validate f =
5151
let mk_no_regalloc_validate f =
5252
"-no-regalloc-validate", Arg.Unit f, " Do not validate register allocation"
5353

54+
let mk_cfg_peephole_optimize f =
55+
"-cfg-peephole-optimize", Arg.Unit f, " Apply peephole optimizations to CFG"
56+
57+
let mk_no_cfg_peephole_optimize f =
58+
"-no-cfg-peephole-optimize", Arg.Unit f, " Do not apply peephole optimizations to CFG"
59+
5460
let mk_reorder_blocks_random f =
5561
"-reorder-blocks-random",
5662
Arg.Int f,
@@ -534,6 +540,9 @@ module type Flambda_backend_options = sig
534540
val regalloc_validate : unit -> unit
535541
val no_regalloc_validate : unit -> unit
536542

543+
val cfg_peephole_optimize : unit -> unit
544+
val no_cfg_peephole_optimize : unit -> unit
545+
537546
val reorder_blocks_random : int -> unit
538547
val basic_block_sections : unit -> unit
539548

@@ -631,6 +640,9 @@ struct
631640
mk_regalloc_validate F.regalloc_validate;
632641
mk_no_regalloc_validate F.no_regalloc_validate;
633642

643+
mk_cfg_peephole_optimize F.cfg_peephole_optimize;
644+
mk_no_cfg_peephole_optimize F.no_cfg_peephole_optimize;
645+
634646
mk_reorder_blocks_random F.reorder_blocks_random;
635647
mk_basic_block_sections F.basic_block_sections;
636648

@@ -757,6 +769,9 @@ module Flambda_backend_options_impl = struct
757769
let regalloc_validate = set' Flambda_backend_flags.regalloc_validate
758770
let no_regalloc_validate = clear' Flambda_backend_flags.regalloc_validate
759771

772+
let cfg_peephole_optimize = set' Flambda_backend_flags.cfg_peephole_optimize
773+
let no_cfg_peephole_optimize = clear' Flambda_backend_flags.cfg_peephole_optimize
774+
760775
let reorder_blocks_random seed =
761776
Flambda_backend_flags.reorder_blocks_random := Some seed
762777
let basic_block_sections () =
@@ -1009,6 +1024,7 @@ module Extra_params = struct
10091024
| "regalloc" -> set_string Flambda_backend_flags.regalloc
10101025
| "regalloc-param" -> add_string Flambda_backend_flags.regalloc_params
10111026
| "regalloc-validate" -> set' Flambda_backend_flags.regalloc_validate
1027+
| "cfg-peephole-optimize" -> set' Flambda_backend_flags.cfg_peephole_optimize
10121028
| "dump-inlining-paths" -> set' Flambda_backend_flags.dump_inlining_paths
10131029
| "davail" -> set' Flambda_backend_flags.davail
10141030
| "reorder-blocks-random" ->

driver/flambda_backend_args.mli

+3
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ module type Flambda_backend_options = sig
3232
val regalloc_validate : unit -> unit
3333
val no_regalloc_validate : unit -> unit
3434

35+
val cfg_peephole_optimize : unit -> unit
36+
val no_cfg_peephole_optimize : unit -> unit
37+
3538
val reorder_blocks_random : int -> unit
3639
val basic_block_sections : unit -> unit
3740

0 commit comments

Comments
 (0)