Skip to content

Commit 5b2ae83

Browse files
authored
Cranelift: Use a fixpoint loop to compute the best value for each eclass (#7859)
* Cranelift: Use a fixpoint loop to compute the best value for each eclass

  Fixes #7857

* Remove fixpoint loop early-continue optimization
* Add document describing optimization rule invariants
* Make select optimizations use subsume
* Remove invalid debug assert
* Remove now-unused methods
* Add commutative adds to cost tests
1 parent 2673a40 commit 5b2ae83

File tree

6 files changed

+294
-67
lines changed

6 files changed

+294
-67
lines changed

cranelift/codegen/src/egraph.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -701,4 +701,5 @@ pub(crate) struct Stats {
701701
pub(crate) elaborate_func: u64,
702702
pub(crate) elaborate_func_pre_insts: u64,
703703
pub(crate) elaborate_func_post_insts: u64,
704+
pub(crate) elaborate_best_cost_fixpoint_iters: u64,
704705
}

cranelift/codegen/src/egraph/cost.rs

Lines changed: 60 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ impl Cost {
7474
const DEPTH_BITS: u8 = 8;
7575
const DEPTH_MASK: u32 = (1 << Self::DEPTH_BITS) - 1;
7676
const OP_COST_MASK: u32 = !Self::DEPTH_MASK;
77-
const MAX_OP_COST: u32 = (Self::OP_COST_MASK >> Self::DEPTH_BITS) - 1;
77+
const MAX_OP_COST: u32 = Self::OP_COST_MASK >> Self::DEPTH_BITS;
7878

7979
pub(crate) fn infinity() -> Cost {
8080
// 2^32 - 1 is, uh, pretty close to infinite... (we use `Cost`
@@ -86,14 +86,16 @@ impl Cost {
8686
Cost(0)
8787
}
8888

89-
/// Construct a new finite cost from the given parts.
89+
/// Construct a new `Cost` from the given parts.
9090
///
91-
/// The opcode cost is clamped to the maximum value representable.
92-
fn new_finite(opcode_cost: u32, depth: u8) -> Cost {
93-
let opcode_cost = std::cmp::min(opcode_cost, Self::MAX_OP_COST);
94-
let cost = Cost((opcode_cost << Self::DEPTH_BITS) | u32::from(depth));
95-
debug_assert_ne!(cost, Cost::infinity());
96-
cost
91+
/// If the opcode cost is greater than or equal to the maximum representable
92+
/// opcode cost, then the resulting `Cost` saturates to infinity.
93+
fn new(opcode_cost: u32, depth: u8) -> Cost {
94+
if opcode_cost >= Self::MAX_OP_COST {
95+
Self::infinity()
96+
} else {
97+
Cost(opcode_cost << Self::DEPTH_BITS | u32::from(depth))
98+
}
9799
}
98100

99101
fn depth(&self) -> u8 {
@@ -111,7 +113,7 @@ impl Cost {
111113
/// that satisfies `inst_predicates::is_pure_for_egraph()`.
112114
pub(crate) fn of_pure_op(op: Opcode, operand_costs: impl IntoIterator<Item = Self>) -> Self {
113115
let c = pure_op_cost(op) + operand_costs.into_iter().sum();
114-
Cost::new_finite(c.op_cost(), c.depth().saturating_add(1))
116+
Cost::new(c.op_cost(), c.depth().saturating_add(1))
115117
}
116118
}
117119

@@ -131,12 +133,9 @@ impl std::ops::Add<Cost> for Cost {
131133
type Output = Cost;
132134

133135
fn add(self, other: Cost) -> Cost {
134-
let op_cost = std::cmp::min(
135-
self.op_cost().saturating_add(other.op_cost()),
136-
Self::MAX_OP_COST,
137-
);
136+
let op_cost = self.op_cost().saturating_add(other.op_cost());
138137
let depth = std::cmp::max(self.depth(), other.depth());
139-
Cost::new_finite(op_cost, depth)
138+
Cost::new(op_cost, depth)
140139
}
141140
}
142141

@@ -147,11 +146,11 @@ impl std::ops::Add<Cost> for Cost {
147146
fn pure_op_cost(op: Opcode) -> Cost {
148147
match op {
149148
// Constants.
150-
Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new_finite(1, 0),
149+
Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1, 0),
151150

152151
// Extends/reduces.
153152
Opcode::Uextend | Opcode::Sextend | Opcode::Ireduce | Opcode::Iconcat | Opcode::Isplit => {
154-
Cost::new_finite(2, 0)
153+
Cost::new(2, 0)
155154
}
156155

157156
// "Simple" arithmetic.
@@ -163,9 +162,52 @@ fn pure_op_cost(op: Opcode) -> Cost {
163162
| Opcode::Bnot
164163
| Opcode::Ishl
165164
| Opcode::Ushr
166-
| Opcode::Sshr => Cost::new_finite(3, 0),
165+
| Opcode::Sshr => Cost::new(3, 0),
167166

168167
// Everything else (pure.)
169-
_ => Cost::new_finite(4, 0),
168+
_ => Cost::new(4, 0),
169+
}
170+
}
171+
172+
#[cfg(test)]
173+
mod tests {
174+
use super::*;
175+
176+
#[test]
177+
fn add_cost() {
178+
let a = Cost::new(5, 2);
179+
let b = Cost::new(37, 3);
180+
assert_eq!(a + b, Cost::new(42, 3));
181+
assert_eq!(b + a, Cost::new(42, 3));
182+
}
183+
184+
#[test]
185+
fn add_infinity() {
186+
let a = Cost::new(5, 2);
187+
let b = Cost::infinity();
188+
assert_eq!(a + b, Cost::infinity());
189+
assert_eq!(b + a, Cost::infinity());
190+
}
191+
192+
#[test]
193+
fn op_cost_saturates_to_infinity() {
194+
let a = Cost::new(Cost::MAX_OP_COST - 10, 2);
195+
let b = Cost::new(11, 2);
196+
assert_eq!(a + b, Cost::infinity());
197+
assert_eq!(b + a, Cost::infinity());
198+
}
199+
200+
#[test]
201+
fn depth_saturates_to_max_depth() {
202+
let a = Cost::new(10, u8::MAX);
203+
let b = Cost::new(10, 1);
204+
assert_eq!(
205+
Cost::of_pure_op(Opcode::Iconst, [a, b]),
206+
Cost::new(21, u8::MAX)
207+
);
208+
assert_eq!(
209+
Cost::of_pure_op(Opcode::Iconst, [b, a]),
210+
Cost::new(21, u8::MAX)
211+
);
170212
}
171213
}

cranelift/codegen/src/egraph/elaborate.rs

Lines changed: 111 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use super::Stats;
77
use crate::dominator_tree::DominatorTree;
88
use crate::fx::{FxHashMap, FxHashSet};
99
use crate::hash_map::Entry as HashEntry;
10+
use crate::inst_predicates::is_pure_for_egraph;
1011
use crate::ir::{Block, Function, Inst, Value, ValueDef};
1112
use crate::loop_analysis::{Loop, LoopAnalysis};
1213
use crate::scoped_hash_map::ScopedHashMap;
@@ -216,46 +217,112 @@ impl<'a> Elaborator<'a> {
216217

217218
fn compute_best_values(&mut self) {
218219
let best = &mut self.value_to_best_value;
219-
for (value, def) in self.func.dfg.values_and_defs() {
220-
trace!("computing best for value {:?} def {:?}", value, def);
221-
match def {
222-
ValueDef::Union(x, y) => {
223-
// Pick the best of the two options based on
224-
// min-cost. This works because each element of `best`
225-
// is a `(cost, value)` tuple; `cost` comes first so
226-
// the natural comparison works based on cost, and
227-
// breaks ties based on value number.
228-
trace!(" -> best of {:?} and {:?}", best[x], best[y]);
229-
best[value] = std::cmp::min(best[x], best[y]);
230-
trace!(" -> {:?}", best[value]);
231-
}
232-
ValueDef::Param(_, _) => {
233-
best[value] = BestEntry(Cost::zero(), value);
234-
}
235-
// If the Inst is inserted into the layout (which is,
236-
// at this point, only the side-effecting skeleton),
237-
// then it must be computed and thus we give it zero
238-
// cost.
239-
ValueDef::Result(inst, _) => {
240-
if let Some(_) = self.func.layout.inst_block(inst) {
241-
best[value] = BestEntry(Cost::zero(), value);
242-
} else {
243-
trace!(" -> value {}: result, computing cost", value);
244-
let inst_data = &self.func.dfg.insts[inst];
245-
// N.B.: at this point we know that the opcode is
246-
// pure, so `pure_op_cost`'s precondition is
247-
// satisfied.
248-
let cost = Cost::of_pure_op(
249-
inst_data.opcode(),
250-
self.func.dfg.inst_values(inst).map(|value| best[value].0),
220+
221+
// Do a fixpoint loop to compute the best value for each eclass.
222+
//
223+
// The maximum number of iterations is the length of the longest chain
224+
// of `vNN -> vMM` edges in the dataflow graph where `NN < MM`, so this
225+
// is *technically* quadratic, but `cranelift-frontend` won't construct
226+
// any such edges. NaN canonicalization will introduce some of these
227+
// edges, but they are chains of only two or three edges. So in
228+
// practice, we *never* do more than a handful of iterations here unless
229+
// (a) we parsed the CLIF from text and the text was funkily numbered,
230+
// which we don't really care about, or (b) the CLIF producer did
231+
// something weird, in which case it is their responsibility to stop
232+
// doing that.
233+
trace!("Entering fixpoint loop to compute the best values for each eclass");
234+
let mut keep_going = true;
235+
while keep_going {
236+
keep_going = false;
237+
trace!(
238+
"fixpoint iteration {}",
239+
self.stats.elaborate_best_cost_fixpoint_iters
240+
);
241+
self.stats.elaborate_best_cost_fixpoint_iters += 1;
242+
243+
for (value, def) in self.func.dfg.values_and_defs() {
244+
trace!("computing best for value {:?} def {:?}", value, def);
245+
let orig_best_value = best[value];
246+
247+
match def {
248+
ValueDef::Union(x, y) => {
249+
// Pick the best of the two options based on
250+
// min-cost. This works because each element of `best`
251+
// is a `(cost, value)` tuple; `cost` comes first so
252+
// the natural comparison works based on cost, and
253+
// breaks ties based on value number.
254+
best[value] = std::cmp::min(best[x], best[y]);
255+
trace!(
256+
" -> best of union({:?}, {:?}) = {:?}",
257+
best[x],
258+
best[y],
259+
best[value]
251260
);
252-
best[value] = BestEntry(cost, value);
253261
}
254-
}
255-
};
256-
debug_assert_ne!(best[value].0, Cost::infinity());
257-
debug_assert_ne!(best[value].1, Value::reserved_value());
258-
trace!("best for eclass {:?}: {:?}", value, best[value]);
262+
ValueDef::Param(_, _) => {
263+
best[value] = BestEntry(Cost::zero(), value);
264+
}
265+
// If the Inst is inserted into the layout (which is,
266+
// at this point, only the side-effecting skeleton),
267+
// then it must be computed and thus we give it zero
268+
// cost.
269+
ValueDef::Result(inst, _) => {
270+
if let Some(_) = self.func.layout.inst_block(inst) {
271+
best[value] = BestEntry(Cost::zero(), value);
272+
} else {
273+
let inst_data = &self.func.dfg.insts[inst];
274+
// N.B.: at this point we know that the opcode is
275+
// pure, so `pure_op_cost`'s precondition is
276+
// satisfied.
277+
let cost = Cost::of_pure_op(
278+
inst_data.opcode(),
279+
self.func.dfg.inst_values(inst).map(|value| best[value].0),
280+
);
281+
best[value] = BestEntry(cost, value);
282+
trace!(" -> cost of value {} = {:?}", value, cost);
283+
}
284+
}
285+
};
286+
287+
// Keep on iterating the fixpoint loop while we are finding new
288+
// best values.
289+
keep_going |= orig_best_value != best[value];
290+
}
291+
}
292+
293+
if cfg!(any(feature = "trace-log", debug_assertions)) {
294+
trace!("finished fixpoint loop to compute best value for each eclass");
295+
for value in self.func.dfg.values() {
296+
trace!("-> best for eclass {:?}: {:?}", value, best[value]);
297+
debug_assert_ne!(best[value].1, Value::reserved_value());
298+
// You might additionally be expecting an assert that the best
299+
// cost is not infinity, however infinite cost *can* happen in
300+
// practice. First, note that our cost function doesn't know
301+
// about any shared structure in the dataflow graph, it only
302+
// sums operand costs. (And trying to avoid that by deduping a
303+
// single operation's operands is a losing game because you can
304+
// always just add one indirection and go from `add(x, x)` to
305+
// `add(foo(x), bar(x))` to hide the shared structure.) Given
306+
// that blindness to sharing, we can make cost grow
307+
// exponentially with a linear sequence of operations:
308+
//
309+
// v0 = iconst.i32 1 ;; cost = 1
310+
// v1 = iadd v0, v0 ;; cost = 3 + 1 + 1
311+
// v2 = iadd v1, v1 ;; cost = 3 + 5 + 5
312+
// v3 = iadd v2, v2 ;; cost = 3 + 13 + 13
313+
// v4 = iadd v3, v3 ;; cost = 3 + 29 + 29
314+
// v5 = iadd v4, v4 ;; cost = 3 + 61 + 61
315+
// v6 = iadd v5, v5 ;; cost = 3 + 125 + 125
316+
// ;; etc...
317+
//
318+
// Such a chain can cause cost to saturate to infinity. How do
319+
// we choose which e-node is best when there are multiple that
320+
// have saturated to infinity? It doesn't matter. As long as
321+
// invariant (2) for optimization rules is upheld by our rule
322+
// set (see `cranelift/codegen/src/opts/README.md`) it is safe
323+
// to choose *any* e-node in the e-class. At worst we will
324+
// produce suboptimal code, but never an incorrectness.
325+
}
259326
}
260327
}
261328

@@ -606,7 +673,13 @@ impl<'a> Elaborator<'a> {
606673
}
607674
inst
608675
};
676+
609677
// Place the inst just before `before`.
678+
debug_assert!(
679+
is_pure_for_egraph(self.func, inst),
680+
"something has gone very wrong if we are elaborating effectful \
681+
instructions, they should have remained in the skeleton"
682+
);
610683
self.func.layout.insert_inst(inst, before);
611684

612685
// Update the inst's arguments.
Lines changed: 81 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,81 @@
1-
Rules here are allowed to rewrite pure expressions arbitrarily,
2-
using the same inputs as the original, or fewer. In other words, we
3-
cannot pull a new eclass id out of thin air and refer to it, other
4-
than a piece of the input or a new node that we construct; but we
5-
can freely rewrite e.g. `x+y-y` to `x`.
1+
# Rules for Writing Optimization Rules
2+
3+
For both correctness and compile speed, we must be careful with our rules. A lot
4+
of it boils down to the fact that, unlike traditional e-graphs, our rules are
5+
*directional*.
6+
7+
1. Rules should not rewrite to worse code: the right-hand side should be at
8+
least as good as the left-hand side or better.
9+
10+
For example, the rule
11+
12+
x => (add x 0)
13+
14+
is disallowed, but swapping its left- and right-hand sides produces a rule
15+
that is allowed.
16+
17+
Any kind of canonicalizing rule that intends to help subsequent rules match
18+
and unlock further optimizations (e.g. floating constants to the right side
19+
for our constant-propagation rules to match) must produce canonicalized
20+
output that is no worse than its noncanonical input.
21+
22+
We assume this invariant as a heuristic to break ties between two
23+
otherwise-equal-cost expressions in various places, making up for some
24+
limitations of our explicit cost function.
25+
26+
2. Any rule that removes value-uses in its right-hand side that previously
27+
existed in its left-hand side MUST use `subsume`.
28+
29+
For example, the rule
30+
31+
(select 1 x y) => x
32+
33+
MUST use `subsume`.
34+
35+
This is required for correctness because, once a value-use is removed, some
36+
e-nodes in the e-class are more equal than others. There might be uses of `x`
37+
in a scope where `y` is not available, and so emitting `(select 1 x y)` in
38+
place of `x` in such cases would introduce uses of `y` where it is not
39+
defined.
40+
41+
3. Avoid overly general rewrites like commutativity and associativity. Instead,
42+
prefer targeted instances of the rewrite (for example, canonicalizing adds
43+
where one operand is a constant such that the constant is always the add's
44+
second operand, rather than general commutativity for adds) or even writing
45+
the "same" optimization rule multiple times.
46+
47+
For example, the commutativity in the first rule in the following snippet is
48+
bad because it will match even when the first operand is not an add:
49+
50+
;; Commute to allow `(foo (add ...) x)`, when we see it, to match.
51+
(foo x y) => (foo y x)
52+
53+
;; Optimize.
54+
(foo x (add ...)) => (bar x)
55+
56+
Better is to commute only when we know that canonicalizing in this way will
57+
all definitely allow the subsequent optimization rule to match:
58+
59+
;; Canonicalize all adds to `foo`'s second operand.
60+
(foo (add ...) x) => (foo x (add ...))
61+
62+
;; Optimize.
63+
(foo x (add ...)) => (bar x)
64+
65+
But even better in this case is to write the "same" optimization multiple
66+
times:
67+
68+
(foo (add ...) x) => (bar x)
69+
(foo x (add ...)) => (bar x)
70+
71+
The cost of rule-matching is amortized by the ISLE compiler, whereas the
72+
intermediate result of each rewrite allocates new e-nodes and requires
73+
storage in the dataflow graph. Therefore, additional rules are cheaper than
74+
additional e-nodes.
75+
76+
Commutativity and associativity in particular can cause huge amounts of
77+
e-graph bloat.
78+
79+
One day we intend to extend ISLE with built-in support for commutativity, so
80+
we don't need to author the redundant commutations ourselves:
81+
https://github.com/bytecodealliance/wasmtime/issues/6128

0 commit comments

Comments
 (0)