Add peephole optimizations for CFG blocks. (ocaml-flambda#1666)

GabiTulba · web-flow · commit ec08424af136 · 2023-08-03T15:49:13.000+01:00
diff --git a/backend/.ocamlformat-enable b/backend/.ocamlformat-enable
@@ -12,5 +12,7 @@ asm_targets/**/*.ml
 asm_targets/**/*.mli
 debug/**/*.ml
 debug/**/*.mli
+peephole/**/*.ml
+peephole/**/*.mli
 regalloc/**/*.ml
 regalloc/**/*.mli
diff --git a/backend/asmgen.ml b/backend/asmgen.ml
@@ -308,6 +308,10 @@ let compile_fundecl ~ppf_dump ~funcnames fd_cmm =
         ++ Cfg_with_infos.cfg_with_layout
         ++ Profile.record ~accumulate:true "cfg_validate_description" (Regalloc_validate.run cfg_description)
         ++ Profile.record ~accumulate:true "cfg_simplify" Regalloc_utils.simplify_cfg
+          (* CR-someday gtulbalecu: The peephole optimizations must not affect liveness, otherwise
+             we would have to recompute it here. Recomputing it here breaks the CI because
+             the liveness_analysis algorithm does not work properly after register allocation. *)
+        ++ Profile.record ~accumulate:true "peephole_optimize_cfg" Peephole_optimize.peephole_optimize_cfg
         ++ Profile.record ~accumulate:true "save_cfg" save_cfg
         ++ Profile.record ~accumulate:true "cfg_reorder_blocks"
              (reorder_blocks_random ppf_dump)
diff --git a/backend/peephole/peephole_optimize.ml b/backend/peephole/peephole_optimize.ml
@@ -0,0 +1,34 @@
+[@@@ocaml.warning "+a-29-40-41-42"]
+
+module DLL = Flambda_backend_utils.Doubly_linked_list
+module R = Peephole_rules
+
+(* We currently don't check that the peephole optimizer terminates. To guard
+   against a sequence of rewrites that never terminates, we bound the number
+   of steps spent on a block linearly in the length of its body (i.e.
+   O(block_body_length) with a small constant). *)
+let termination_cond_const = 5
+
+(* [optimize_body steps_until_termination cell] tries the peephole rules at
+   [cell], a cell of the doubly linked list that holds the instructions of a
+   block's body. When a rule fires, optimization resumes from the cell the rule
+   returned; otherwise it moves on to the next cell. Each visited cell consumes
+   one step; the traversal stops when the step budget is exhausted or when
+   there is no next cell. *)
+let rec optimize_body steps_until_termination cell =
+  if steps_until_termination > 0
+  then
+    let resume_from =
+      match R.apply cell with
+      | Some _ as rewritten -> rewritten
+      | None -> DLL.next cell
+    in
+    match resume_from with
+    | Some target -> optimize_body (steps_until_termination - 1) target
+    | None -> ()
+
+(* Apply peephole optimization to the body of each block of the CFG. Nothing
+   is done unless [Flambda_backend_flags.cfg_peephole_optimize] is set. The
+   step budget of a block is [termination_cond_const] times the length of its
+   body. The argument is returned so the pass can be chained with others. *)
+let peephole_optimize_cfg cfg_with_layout =
+  (if !Flambda_backend_flags.cfg_peephole_optimize
+   then
+     let cfg = Cfg_with_layout.cfg cfg_with_layout in
+     Cfg.iter_blocks cfg ~f:(fun (_ : int) block ->
+         match DLL.hd_cell block.body with
+         | None -> ()
+         | Some first_cell ->
+           let step_budget = termination_cond_const * DLL.length block.body in
+           optimize_body step_budget first_cell));
+  cfg_with_layout
diff --git a/backend/peephole/peephole_optimize.mli b/backend/peephole/peephole_optimize.mli
@@ -0,0 +1,3 @@
+[@@@ocaml.warning "+a-30-40-41-42"]
+
+val peephole_optimize_cfg : Cfg_with_layout.t -> Cfg_with_layout.t
diff --git a/backend/peephole/peephole_rules.ml b/backend/peephole/peephole_rules.ml
@@ -0,0 +1,199 @@
+(* CR-someday: see whether the `-4` can be dropped. *)
+[@@@ocaml.warning "+a-29-40-41-42-4"]
+
+module DLL = Flambda_backend_utils.Doubly_linked_list
+module U = Peephole_utils
+
+(** Rewrite rule for a move that is immediately undone:
+    {|
+    mov x, y
+    mov y, x
+    |}
+
+    The second instruction is redundant and is deleted. [Move], [Spill] and
+    [Reload] operations are all treated as moves. When the rule fires, the
+    result is the cell at most [U.go_back_const] positions before the first
+    move; otherwise the result is [None]. *)
+
+let remove_useless_mov (cell : Cfg.basic Cfg.instruction DLL.cell) =
+  match U.get_cells cell 2 with
+  | [fst; snd] -> (
+    let fst_val = DLL.value fst in
+    let snd_val = DLL.value snd in
+    match fst_val.desc, snd_val.desc with
+    | Op (Move | Spill | Reload), Op (Move | Spill | Reload) ->
+      (* The second move copies the first one's destination back into the
+         first one's source. *)
+      let undoes_first =
+        U.are_equal_regs fst_val.arg.(0) snd_val.res.(0)
+        && U.are_equal_regs fst_val.res.(0) snd_val.arg.(0)
+      in
+      if undoes_first
+      then (
+        DLL.delete_curr snd;
+        Some (U.prev_at_most U.go_back_const fst))
+      else None
+    | _ -> None)
+  | _ -> None
+
+(** Logical condition for folding two consecutive immediate operations on the
+    same register:
+  {|
+    <op 1> const1, r
+    <op 2> const2, r
+  |}
+
+  into:
+  {|
+    <op> const, r
+  |}
+
+   [are_compatible op1 op2 imm1 imm2] returns [Some (op, const)] when the pair
+   can be replaced by the single operation [op] with immediate [const], and
+   [None] when it cannot (unsupported pair of operators, or a folded immediate
+   that fails the range checks below). *)
+
+let are_compatible op1 op2 imm1 imm2 =
+  (* Folding two bitwise operations (AND, OR, XOR) should never produce an
+     overflow, so the bound is asserted rather than tested. *)
+  let fold_bitwise op combine =
+    assert (U.amd64_imm32_within_bounds imm1 imm2 combine);
+    Some (op, combine imm1 imm2)
+  in
+  (* Two shifts in the same direction add up as long as the total amount does
+     not exceed [Sys.int_size]. In some situations one or both immediate values
+     could be out of bounds while their sum is within bounds (e.g. imm1 = -4
+     and imm2 = 65, their sum being 61). This should not happen at all since
+     the immediate values should always be within the bounds
+     [0, Sys.int_size], hence the assertion. *)
+  let fold_shift op =
+    if Misc.no_overflow_add imm1 imm2 && imm1 + imm2 <= Sys.int_size
+    then (
+      U.bitwise_shift_assert imm1 imm2;
+      Some (op, imm1 + imm2))
+    else None
+  in
+  (* For the amd64 instruction set the `ADD`, `SUB` and `MUL` operations take
+     at most an imm32 as the second argument, so on top of the native overflow
+     check we need to check that the folded immediate fits in a 32-bit signed
+     int. *)
+  (* CR-someday gtulba-lecu: This condition is architecture specific and should
+     either live in amd64 specific code or this module should contain
+     information about the architecture target. *)
+  let fold_imm32 op no_overflow combine lhs rhs =
+    if no_overflow lhs rhs && U.amd64_imm32_within_bounds lhs rhs combine
+    then Some (op, combine lhs rhs)
+    else None
+  in
+  match (op1 : Mach.integer_operation), (op2 : Mach.integer_operation) with
+  | Mach.Iand, Mach.Iand -> fold_bitwise Mach.Iand ( land )
+  | Ior, Ior -> fold_bitwise Mach.Ior ( lor )
+  | Ixor, Ixor -> fold_bitwise Mach.Ixor ( lxor )
+  | Ilsl, Ilsl -> fold_shift Mach.Ilsl
+  | Ilsr, Ilsr -> fold_shift Mach.Ilsr
+  | Iasr, Iasr -> fold_shift Mach.Iasr
+  | Iadd, Iadd -> fold_imm32 Mach.Iadd Misc.no_overflow_add ( + ) imm1 imm2
+  | Isub, Isub -> fold_imm32 Mach.Isub Misc.no_overflow_add ( + ) imm1 imm2
+  (* Mixed add/sub pairs: the operator of the folded instruction is chosen so
+     that its immediate, the difference of the two, is never negative. *)
+  | Iadd, Isub ->
+    if imm1 >= imm2
+    then fold_imm32 Mach.Iadd Misc.no_overflow_sub ( - ) imm1 imm2
+    else fold_imm32 Mach.Isub Misc.no_overflow_sub ( - ) imm2 imm1
+  | Isub, Iadd ->
+    if imm1 >= imm2
+    then fold_imm32 Mach.Isub Misc.no_overflow_sub ( - ) imm1 imm2
+    else fold_imm32 Mach.Iadd Misc.no_overflow_sub ( - ) imm2 imm1
+  (* A left shift by a constant is a multiplication by a power of two; the
+     shift amount is restricted to [0, 31) before computing that power. *)
+  | Ilsl, Imul ->
+    if imm1 >= 0 && imm1 < 31
+    then fold_imm32 Mach.Imul Misc.no_overflow_mul ( * ) (1 lsl imm1) imm2
+    else None
+  | Imul, Ilsl ->
+    if imm2 >= 0 && imm2 < 31
+    then fold_imm32 Mach.Imul Misc.no_overflow_mul ( * ) imm1 (1 lsl imm2)
+    else None
+  | Imul, Imul -> fold_imm32 Mach.Imul Misc.no_overflow_mul ( * ) imm1 imm2
+  (* CR-soon gtulba-lecu: check this last case:
+
+     | Imod, Imod -> if imm1 mod imm2 = 0 then Some (Mach.Imod, imm2) else None
+
+     The integer modulo imm2 group is a subgroup of the integer modulo imm1
+     group iff imm2 divides imm1.
+
+     This is because the operations in the groups are addition modulo n and m
+     respectively. If n divides m, then every result of the operation (addition)
+     in the n group will also be a legal result in the m group, which is
+     essentially the definition of a subgroup. If n does not divide m, there
+     will be some results in the n group that are not acceptable in the m
+     group. *)
+  | _ -> None
+
+(* [fold_intop_imm cell] inspects the instruction in [cell] and the one that
+   follows it. When both are immediate integer operations that read and write
+   the same single register, and [are_compatible] can merge them, the pair is
+   replaced by one folded instruction and the result is the cell at most
+   [U.go_back_const] positions before the new instruction. In every other case
+   the result is [None] and the list is not modified. *)
+let fold_intop_imm (cell : Cfg.basic Cfg.instruction DLL.cell) =
+  (* True when [instr] has exactly one argument and one result and both are the
+     same register. Writing the result back to the source register is redundant
+     to check on amd64, since no instruction there invalidates it. *)
+  (* CR-someday gtulba-lecu: This condition is architecture specific and
+     should either live in amd64 specific code or this module should contain
+     information about the architecture target. *)
+  let updates_register_in_place (instr : Cfg.basic Cfg.instruction) =
+    Array.length instr.arg = 1
+    && Array.length instr.res = 1
+    && U.are_equal_regs instr.arg.(0) instr.res.(0)
+  in
+  match U.get_cells cell 2 with
+  | [fst; snd] -> (
+    let fst_val = DLL.value fst in
+    let snd_val = DLL.value snd in
+    match fst_val.desc, snd_val.desc with
+    | Op (Intop_imm (op1, imm1)), Op (Intop_imm (op2, imm2)) ->
+      if updates_register_in_place fst_val
+         && updates_register_in_place snd_val
+         (* Both instructions must operate on the same register. *)
+         && U.are_equal_regs fst_val.arg.(0) snd_val.arg.(0)
+      then (
+        match are_compatible op1 op2 imm1 imm2 with
+        | None -> None
+        | Some (op, imm) ->
+          (* The folded instruction reuses every field of the first one except
+             its description. *)
+          let folded = { fst_val with desc = Cfg.Op (Intop_imm (op, imm)) } in
+          let new_cell = DLL.insert_and_return_before fst folded in
+          DLL.delete_curr fst;
+          DLL.delete_curr snd;
+          Some (U.prev_at_most U.go_back_const new_cell))
+      else None
+    | _ -> None)
+  | _ -> None
+
+(* [apply cell] tries the rewrite rules at [cell] in order: first
+   [remove_useless_mov], then [fold_intop_imm]. The result of the first rule
+   that fires is returned; [None] means no rule applied. *)
+let apply cell =
+  match remove_useless_mov cell with
+  | Some _ as rewritten -> rewritten
+  | None -> fold_intop_imm cell
diff --git a/backend/peephole/peephole_rules.mli b/backend/peephole/peephole_rules.mli
@@ -0,0 +1,7 @@
+[@@@ocaml.warning "+a-29-40-41-42"]
+
+open! Peephole_utils
+
+val apply :
+  Cfg.basic Cfg.instruction DLL.cell ->
+  Cfg.basic Cfg.instruction DLL.cell option
diff --git a/backend/peephole/peephole_utils.ml b/backend/peephole/peephole_utils.ml
@@ -0,0 +1,51 @@
+module DLL = Flambda_backend_utils.Doubly_linked_list
+
+(* CR-someday gtulba-lecu: make sure that this comparison is correct and
+   sufficient. Consider using Proc.regs_are_volatile in the future. As we only
+   support amd64 and Proc.regs_are_volatile is always false on amd64, this is
+   not necessary for now. See backend/cfg/cfg_deadcode.ml for more details. *)
+(* Two registers are considered equal when they have the same location and
+   the same type. *)
+let are_equal_regs (reg1 : Reg.t) (reg2 : Reg.t) =
+  if Reg.same_loc reg1 reg2 then reg1.typ = reg2.typ else false
+
+(* CR-soon gtulba-lecu: Delete this when implementing auto-generated rules. *)
+let go_back_const = 1
+
+(* [prev_at_most steps cell] walks backwards from [cell] and returns the cell
+   [steps] positions before it, or the first cell of the list if the list
+   starts earlier than that. *)
+let rec prev_at_most steps cell =
+  (* Convention: the caller must ask to go back at least one element. *)
+  assert (steps > 0);
+  match DLL.prev cell with
+  | None -> cell
+  | Some prev_cell when steps = 1 -> prev_cell
+  | Some prev_cell -> prev_at_most (steps - 1) prev_cell
+
+(* Accumulator-based worker for [get_cells]: [lst] holds the cells collected
+   so far in reverse order, [size] is the number of cells still wanted and
+   [cell] is the next candidate, if any. *)
+let rec get_cells' (cell : Cfg.basic Cfg.instruction DLL.cell option) size lst =
+  match cell, size with
+  | None, _ | Some _, 0 -> List.rev lst
+  | Some cell, size -> get_cells' (DLL.next cell) (size - 1) (cell :: lst)
+
+(* [get_cells cell size] returns [cell] followed by the cells after it, in
+   list order, up to [size] cells in total. The result is shorter than [size]
+   when the list ends first. [size] must be strictly positive. *)
+let get_cells cell size =
+  assert (size > 0);
+  let following = DLL.next cell in
+  get_cells' following (size - 1) [cell]
+
+(* [is_bitwise_op op] is [true] for [Iand], [Ior], [Ixor], [Ilsl], [Ilsr] and
+   [Iasr], and [false] for every other integer operation. *)
+let is_bitwise_op : Mach.integer_operation -> bool = function
+  | Mach.Iand | Mach.Ior | Mach.Ixor -> true
+  | Mach.Ilsl | Mach.Ilsr | Mach.Iasr -> true
+  | _ -> false
+  [@@ocaml.warning "-4"]
+
+(* Fails with [Assert_failure] unless both shift amounts lie within
+   [0, Sys.int_size]. [assert false] is used so that the check is never
+   compiled out. *)
+let bitwise_shift_assert (imm1 : int) (imm2 : int) =
+  let is_valid_shift imm = 0 <= imm && imm <= Sys.int_size in
+  if not (is_valid_shift imm1 && is_valid_shift imm2) then assert false
+  [@@inline]
+
+(* CR-someday gtulba-lecu: This is architecture specific and should be moved
+   to a part of the compiler that is specific to the amd64 architecture. This
+   is fine for now as we only support amd64. *)
+(* [amd64_imm32_within_bounds imm1 imm2 op] is [true] iff [op imm1 imm2] fits
+   in a signed 32-bit integer. *)
+let amd64_imm32_within_bounds imm1 imm2 op =
+  let folded = op imm1 imm2 in
+  let lower = Int32.to_int Int32.min_int in
+  let upper = Int32.to_int Int32.max_int in
+  folded >= lower && folded <= upper
+  [@@inline]
diff --git a/backend/peephole/peephole_utils.mli b/backend/peephole/peephole_utils.mli
@@ -0,0 +1,20 @@
+[@@@ocaml.warning "+a-29-40-41-42"]
+
+module DLL = Flambda_backend_utils.Doubly_linked_list
+
+val are_equal_regs : Reg.t -> Reg.t -> bool
+
+val go_back_const : int
+
+val prev_at_most : int -> 'a DLL.cell -> 'a DLL.cell
+
+val get_cells :
+  Cfg.basic Cfg.instruction DLL.cell ->
+  int ->
+  Cfg.basic Cfg.instruction DLL.cell list
+
+val is_bitwise_op : Mach.integer_operation -> bool
+
+val bitwise_shift_assert : int -> int -> unit
+
+val amd64_imm32_within_bounds : int -> int -> (int -> int -> int) -> bool
diff --git a/driver/flambda_backend_args.ml b/driver/flambda_backend_args.ml
@@ -51,6 +51,12 @@ let mk_regalloc_validate f =
 let mk_no_regalloc_validate f =
   "-no-regalloc-validate", Arg.Unit f, " Do not validate register allocation"
 
+(* Command-line entry for [-cfg-peephole-optimize]; [f] is the unit action
+   attached to the flag. *)
+let mk_cfg_peephole_optimize f =
+  let doc = " Apply peephole optimizations to CFG" in
+  "-cfg-peephole-optimize", Arg.Unit f, doc
+
+(* Command-line entry for [-no-cfg-peephole-optimize]; [f] is the unit action
+   attached to the flag. *)
+let mk_no_cfg_peephole_optimize f =
+  let doc = " Do not apply peephole optimizations to CFG" in
+  "-no-cfg-peephole-optimize", Arg.Unit f, doc
+
 let mk_reorder_blocks_random f =
   "-reorder-blocks-random",
   Arg.Int f,
@@ -534,6 +540,9 @@ module type Flambda_backend_options = sig
   val regalloc_validate : unit -> unit
   val no_regalloc_validate : unit -> unit
 
+  val cfg_peephole_optimize : unit -> unit
+  val no_cfg_peephole_optimize : unit -> unit
+
   val reorder_blocks_random : int -> unit
   val basic_block_sections : unit -> unit
 
@@ -631,6 +640,9 @@ struct
     mk_regalloc_validate F.regalloc_validate;
     mk_no_regalloc_validate F.no_regalloc_validate;
 
+    mk_cfg_peephole_optimize F.cfg_peephole_optimize;
+    mk_no_cfg_peephole_optimize F.no_cfg_peephole_optimize;
+
     mk_reorder_blocks_random F.reorder_blocks_random;
     mk_basic_block_sections F.basic_block_sections;
 
@@ -757,6 +769,9 @@ module Flambda_backend_options_impl = struct
   let regalloc_validate = set' Flambda_backend_flags.regalloc_validate
   let no_regalloc_validate = clear' Flambda_backend_flags.regalloc_validate
 
+  let cfg_peephole_optimize = set' Flambda_backend_flags.cfg_peephole_optimize
+  let no_cfg_peephole_optimize = clear' Flambda_backend_flags.cfg_peephole_optimize
+
   let reorder_blocks_random seed =
     Flambda_backend_flags.reorder_blocks_random := Some seed
   let basic_block_sections () =
@@ -1009,6 +1024,7 @@ module Extra_params = struct
     | "regalloc" -> set_string Flambda_backend_flags.regalloc
     | "regalloc-param" -> add_string Flambda_backend_flags.regalloc_params
     | "regalloc-validate" -> set' Flambda_backend_flags.regalloc_validate
+    | "cfg-peephole-optimize" -> set' Flambda_backend_flags.cfg_peephole_optimize
     | "dump-inlining-paths" -> set' Flambda_backend_flags.dump_inlining_paths
     | "davail" -> set' Flambda_backend_flags.davail
     | "reorder-blocks-random" ->
diff --git a/driver/flambda_backend_args.mli b/driver/flambda_backend_args.mli
@@ -32,6 +32,9 @@ module type Flambda_backend_options = sig
   val regalloc_validate : unit -> unit
   val no_regalloc_validate : unit -> unit
 
+  val cfg_peephole_optimize : unit -> unit
+  val no_cfg_peephole_optimize : unit -> unit
+
   val reorder_blocks_random : int -> unit
   val basic_block_sections : unit -> unit
 
diff --git a/driver/flambda_backend_flags.ml b/driver/flambda_backend_flags.ml
diff --git a/driver/flambda_backend_flags.mli b/driver/flambda_backend_flags.mli
diff --git a/dune b/dune
diff --git a/utils/doubly_linked_list.ml b/utils/doubly_linked_list.ml
diff --git a/utils/doubly_linked_list.mli b/utils/doubly_linked_list.mli

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[@@@ocaml.warning "+a-30-40-41-42"]`
	`2`	`+`
	`3`	`+val peephole_optimize_cfg : Cfg_with_layout.t -> Cfg_with_layout.t`