8585// compute the LUB of a bunch of calls to a target and then investigate
8686// that one case and use it in all those callers.
8787// TODO: Not just direct calls? But updating vtables is complex.
88+ // TODO: Should we look at no-inline flags? We do move code between functions,
89+ // but it isn't normal inlining.
8890//
8991
9092#include " ir/cost.h"
93+ #include " ir/effects.h"
9194#include " ir/find_all.h"
95+ #include " ir/iteration.h"
9296#include " ir/manipulation.h"
9397#include " ir/module-utils.h"
9498#include " ir/names.h"
99+ #include " ir/properties.h"
95100#include " ir/return-utils.h"
96101#include " ir/type-updating.h"
97102#include " ir/utils.h"
@@ -212,33 +217,204 @@ struct CallContext {
212217 // remaining values by updating |newOperands| (for example, if all the values
213218 // sent are constants, then |newOperands| will end up empty, as we have
214219 // nothing left to send).
220+ //
221+ // The approach implemented here tries to move as much code into the call
222+ // context as possible. That may not always be helpful, say in situations like
223+ // this:
224+ //
225+ // (call $foo
226+ // (i32.add
227+ // (local.get $x)
228+ // (local.get $y)
229+ // )
230+ // )
231+ //
232+ // If we move the i32.add into $foo then it will still be adding two unknown
233+ // values (which will be parameters rather than locals). Moving the add might
234+ // just increase code size if so. However, there are many other situations
235+ // where the more code, the better:
236+ //
237+ // (call $foo
238+ // (i32.eqz
239+ // (local.get $x)
240+ // )
241+ // )
242+ //
243+ // While the value remains unknown, after moving the i32.eqz into the target
244+ // function we may be able to use the fact that it has at most 1 bit set.
245+ // Even larger benefits can happen in WasmGC:
246+ //
247+ // (call $foo
248+ // (struct.new $T
249+ // (local.get $x)
250+ // (local.get $y)
251+ // )
252+ // )
253+ //
254+ // If the struct never escapes then we may be able to remove the allocation
255+ // after monomorphization, even if we know nothing about the values in its
256+ // fields.
257+ //
258+ // TODO: Explore other options that are more careful about how much code is
259+ // moved.
215260 void buildFromCall (CallInfo& info,
216261 std::vector<Expression*>& newOperands,
217- Module& wasm) {
262+ Module& wasm,
263+ const PassOptions& options) {
218264 Builder builder (wasm);
219265
266+ // First, find the things we can move into the context and the things we
267+ // cannot. Some things simply cannot be moved out of the calling function,
268+ // such as a local.set, but also we need to handle effect interactions among
269+ // the operands, because each time we move code into the context we are
270+ // pushing it into the called function, which changes the order of
271+ // operations, for example:
272+ //
273+ // (call $foo
274+ // (first
275+ // (a)
276+ // )
277+ // (second
278+ // (b)
279+ // )
280+ // )
281+ //
282+ // (func $foo (param $first) (param $second)
283+ // )
284+ //
285+ // If we move |first| and |a| into the context then we get this:
286+ //
287+ // (call $foo
288+ // ;; |first| and |a| were removed from here.
289+ // (second
290+ // (b)
291+ // )
292+ // )
293+ //
294+ // (func $foo (param $second)
295+ // ;; |first| is now a local, and we assign it inside the called func.
296+ // (local $first)
297+ // (local.set $first
298+ // (first
299+ // (a)
300+ // )
301+ // )
302+ // )
303+ //
304+ // After this code motion we execute |second| and |b| *before* the call, and
305+ // |first| and |a| after, so we cannot do this transformation if the order
306+ // of operations between them matters.
307+ //
308+ // The key property here is that all things that are moved into the context
309+ // (moved into the monomorphized function) remain ordered with respect to
310+ // each other, but must be moved past all non-moving things after them. For
311+ // example, say we want to move B and D in this list (of expressions in
312+ // execution order):
313+ //
314+ // A, B, C, D, E
315+ //
316+ // After moving B and D we end up with this:
317+ //
318+ // A, C, E and executing later in the monomorphized function: B, D
319+ //
320+ // Then we must be able to move B past C and E, and D past E. It is simplest
321+ // to compute this in reverse order, starting from E and going back, and
322+ // then each time we want to move something we can check if it can cross
323+ // over all the non-moving effects we've seen so far. To compute this, first
324+ // list out the post-order of the expressions, and then we'll iterate in
325+ // reverse.
326+ struct Lister
327+ : public PostWalker<Lister, UnifiedExpressionVisitor<Lister>> {
328+ std::vector<Expression*> list;
329+ void visitExpression (Expression* curr) { list.push_back (curr); }
330+ } lister;
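331+ // As a quick estimate, we need space for at least the operands.
// NOTE(review): |operands| below appears to be this context's own list, which
// is only filled in later in this function, while the walk iterates
// info.call->operands. Confirm which size was the intended estimate.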
332+ lister.list .reserve (operands.size ());
333+
334+ for (auto * operand : info.call ->operands ) {
335+ lister.walk (operand);
336+ }
337+
338+ // Go in reverse post-order as explained earlier, noting what cannot be
339+ // moved into the context, and while accumulating the effects that are not
340+ // moving.
341+ std::unordered_set<Expression*> immovable;
342+ EffectAnalyzer nonMovingEffects (options, wasm);
343+ for (auto i = int64_t (lister.list .size ()) - 1 ; i >= 0 ; i--) {
344+ auto * curr = lister.list [i];
345+
346+ // This may have been marked as immovable because of the parent. We do
347+ // that because if a parent is immovable then we can't move the children
348+ // into the context (if we did, they would execute after the parent, but
349+ // it needs their values).
350+ bool currImmovable = immovable.count (curr) > 0 ;
351+ if (!currImmovable) {
352+ // This might be movable or immovable. Check both effect interactions
353+ // (as described before, we want to move this past immovable code) and
354+ // reasons intrinsic to the expression itself that might prevent moving.
355+ ShallowEffectAnalyzer currEffects (options, wasm, curr);
356+ if (currEffects.invalidates (nonMovingEffects) ||
357+ !canBeMovedIntoContext (curr, currEffects)) {
358+ immovable.insert (curr);
359+ currImmovable = true ;
360+ }
361+ }
362+
363+ if (currImmovable) {
364+ // Regardless of whether this was marked immovable because of the
365+ // parent, or because we just found it cannot be moved, accumulate the
366+ // effects, and also mark its immediate children (so that we do the same
367+ // when we get to them).
368+ nonMovingEffects.visit (curr);
369+ for (auto * child : ChildIterator (curr)) {
370+ immovable.insert (child);
371+ }
372+ }
373+ }
374+
375+ // We now know which code can be moved and which cannot, so we can do the
376+ // final processing of the call operands. We do this as a copy operation,
377+ // copying as much as possible into the call context. Code that cannot be
378+ // moved ends up as values sent to the monomorphized function.
379+ //
380+ // The copy operation works in pre-order, which allows us to override
381+ // entire children as needed:
382+ //
383+ // (call $foo
384+ // (problem
385+ // (a)
386+ // )
387+ // (later)
388+ // )
389+ //
390+ // We visit |problem| first. If something prevents us from moving it
391+ // into the context then we override the copy, so that both it and its
392+ // child |a| remain in the caller (and |a| is never visited in the
393+ // copy).
220394 for (auto * operand : info.call ->operands ) {
221- // Process the operand. This is a copy operation, as we are trying to move
222- // (copy) code from the callsite into the called function. When we find we
223- // can copy then we do so, and when we cannot that value remains as a
224- // value sent from the call.
225395 operands.push_back (ExpressionManipulator::flexibleCopy (
226396 operand, wasm, [&](Expression* child) -> Expression* {
227- if (canBeMovedIntoContext (child)) {
228- // This can be moved, great: let the copy happen.
397+ if (!child) {
398+ // This is an optional child that is not present. Let the copy of
399+ // the nullptr happen.
400+ return nullptr ;
401+ }
402+
403+ if (!immovable.count (child)) {
404+ // This can be moved; let the copy happen.
229405 return nullptr ;
230406 }
231407
232- // This cannot be moved, so we stop here: this is a value that is sent
233- // into the monomorphized function. It is a new operand in the call,
234- // and in the context operands it is a local.get, that reads that
235- // value.
408+ // This cannot be moved. Do not copy it into the call context. In the
409+ // example above, |problem| remains as an operand on the call (so we
410+ // add it to |newOperands|), and in the call context all we have is a
411+ // local.get that reads that sent value.
236412 auto paramIndex = newOperands.size ();
237413 newOperands.push_back (child);
238414 // TODO: If one operand is a tee and another a get, we could actually
239415 // reuse the local, effectively showing the monomorphized
240- // function that the values are the same. (But then the checks
241- // later down to is<LocalGet> would need to check index too.)
416+ // function that the values are the same. EquivalentSets may
417+ // help here.
242418 return builder.makeLocalGet (paramIndex, child->type );
243419 }));
244420 }
@@ -247,12 +423,49 @@ struct CallContext {
247423 }
248424
249425 // Checks whether an expression can be moved into the context.
250- bool canBeMovedIntoContext (Expression* curr) {
251- // Constant numbers, funcs, strings, etc. can all be copied, so it is ok to
252- // add them to the context.
253- // TODO: Allow global.get as well, and anything else that is purely
254- // copyable.
255- return Properties::isSingleConstantExpression (curr);
// |curr| is judged on its own here: |effects| are its shallow effects, as
// computed by the caller, and its children are inspected only for their
// types. Returns true if nothing intrinsic to |curr| prevents moving it.
426+ bool canBeMovedIntoContext (Expression* curr,
427+ const ShallowEffectAnalyzer& effects) {
428+ // Pretty much everything can be moved into the context if we can copy it
429+ // between functions, such as constants, globals, etc. The checks below
430+ // reject the things that we cannot copy.
431+ if (effects.branchesOut || effects.hasExternalBreakTargets ()) {
432+ // This branches or returns. We can't move control flow between functions.
433+ return false ;
434+ }
435+ if (effects.accessesLocal ()) {
436+ // Reads/writes to local state cannot be moved around.
437+ return false ;
438+ }
439+ if (effects.calls ) {
440+ // We can in principle move calls, but for simplicity we avoid such
441+ // situations (which might involve recursion etc.).
442+ return false ;
443+ }
444+ if (Properties::isControlFlowStructure (curr)) {
445+ // We can in principle move entire control flow structures with their
446+ // children, but for simplicity stop when we see one rather than look
447+ // inside to see if we could transfer all its contents. (We would also
448+ // need to be careful when handling If arms, etc.)
449+ return false ;
450+ }
451+ for (auto * child : ChildIterator (curr)) {
452+ if (child->type .isTuple ()) {
453+ // Consider this:
454+ //
455+ // (call $target
456+ // (tuple.extract 2 1
457+ // (local.get $tuple)
458+ // )
459+ // )
460+ //
461+ // We cannot move the tuple.extract into the context while its tuple child
462+ // stays behind, because then the call would have a tuple param. While it
463+ // is possible to split up the tuple, or to check if we can also move the
464+ // children with the parent, for simplicity just ignore this rare situation.
465+ return false ;
466+ }
467+ }
468+ return true ;
256469 }
257470
258471 // Check if a context is trivial relative to a call, that is, the context
@@ -389,7 +602,7 @@ struct Monomorphize : public Pass {
389602 // if we use that context.
390603 CallContext context;
391604 std::vector<Expression*> newOperands;
392- context.buildFromCall (info, newOperands, wasm);
605+ context.buildFromCall (info, newOperands, wasm, getPassOptions () );
393606
394607 // See if we've already evaluated this function + call context. If so, then
395608 // we've memoized the result.
@@ -447,8 +660,22 @@ struct Monomorphize : public Pass {
447660 doOpts (func);
448661 doOpts (monoFunc.get ());
449662
663+ // The cost before monomorphization is the old body + the context
664+ // operands. The operands will be *removed* from the calling code if we
665+ // optimize, and moved into the monomorphized function, so the proper
666+ // comparison is the context + the old body, versus the new body (which
667+ // includes the reverse-inlined call context).
450668 auto costBefore = CostAnalyzer (func->body ).cost ;
669+ for (auto * operand : context.operands ) {
670+ // Note that a slight oddity is that we have *not* optimized the
671+ // operands before. We optimize func before and after, but the operands
672+ // are in the calling function, which we are not modifying here. In
673+ // theory that might lead to false positives, if the call's operands are
674+ // very unoptimized.
675+ costBefore += CostAnalyzer (operand).cost ;
676+ }
451677 auto costAfter = CostAnalyzer (monoFunc->body ).cost ;
678+
452679 // TODO: We should probably only accept improvements above some minimum,
453680 // to avoid optimizing cases where we duplicate a huge function but
454681 // only optimize a tiny part of it compared to the original.
@@ -486,22 +713,27 @@ struct Monomorphize : public Pass {
486713 // Copy the function as the base for the new one.
487714 auto newFunc = ModuleUtils::copyFunctionWithoutAdd (func, wasm, newName);
488715
489- // Generate the new signature, and apply it to the new function.
716+ // A local.get is a value that arrives in a parameter. Anything else is
717+ // something that we are reverse-inlining into the function, so we don't
718+ // need a param for it. Note that we might have multiple gets nested here,
719+ // if we are copying part of the original operand but not all children, so
720+ // we scan each operand for all such local.gets.
721+ //
722+ // Use this information to generate the new signature, and apply it to the
723+ // new function.
490724 std::vector<Type> newParams;
491725 for (auto * operand : context.operands ) {
492- // A local.get is a value that arrives in a parameter. Anything else is
493- // something that we are reverse-inlining into the function, so we don't
494- // need a param for it.
495- if (operand->is <LocalGet>()) {
496- newParams.push_back (operand->type );
726+ FindAll<LocalGet> gets (operand);
727+ for (auto * get : gets.list ) {
728+ newParams.push_back (get->type );
497729 }
498730 }
499731 // If we were dropped then we are pulling the drop into the monomorphized
500732 // function, which means we return nothing.
501733 auto newResults = context.dropped ? Type::none : func->getResults ();
502734 newFunc->type = Signature (Type (newParams), newResults);
503735
504- // We must update local indexes: the new function has a potentially
736+ // We must update local indexes: the new function has a potentially
505737 // different number of parameters, and parameters are at the very bottom of
506738 // the local index space. We are also replacing old params with vars. To
507739 // track this, map each old index to the new one.
@@ -559,7 +791,7 @@ struct Monomorphize : public Pass {
559791 // (local.get $param) ;; copied old body
560792 // )
561793 //
562- // We need to add such an local.set in the prelude of the function for each
794+ // We need to add such a local.set in the prelude of the function for each
563795 // operand in the context.
564796 std::vector<Expression*> pre ;
565797 for (Index i = 0 ; i < context.operands .size (); i++) {
0 commit comments