 // looked at.
 //

-#include <ir/literal-utils.h>
-#include <ir/local-graph.h>
-#include <ir/manipulation.h>
-#include <ir/properties.h>
-#include <ir/utils.h>
-#include <pass.h>
-#include <support/unique_deferring_queue.h>
-#include <wasm-builder.h>
-#include <wasm-interpreter.h>
-#include <wasm.h>
+#include "ir/effects.h"
+#include "ir/iteration.h"
+#include "ir/literal-utils.h"
+#include "ir/local-graph.h"
+#include "ir/manipulation.h"
+#include "ir/properties.h"
+#include "ir/utils.h"
+#include "pass.h"
+#include "support/insert_ordered.h"
+#include "support/unique_deferring_queue.h"
+#include "wasm-builder.h"
+#include "wasm-interpreter.h"
+#include "wasm.h"

 namespace wasm {

@@ -210,9 +213,16 @@ struct Precompute
   GetValues getValues;
   HeapValues heapValues;

+  bool canPartiallyPrecompute;
+
   void doWalkFunction(Function* func) {
+    // Perform partial precomputing only when the optimization level is non-
+    // trivial, as it is slower and less likely to help.
+    canPartiallyPrecompute = getPassOptions().optimizeLevel >= 2;
+
     // Walk the function and precompute things.
     super::doWalkFunction(func);
+    partiallyPrecompute(func);
     if (!propagate) {
       return;
     }
@@ -226,11 +236,13 @@ struct Precompute
       // another walk to apply them and perhaps other optimizations that are
       // unlocked.
       super::doWalkFunction(func);
+      // We could also try to partially precompute again, but that is a somewhat
+      // heavy operation, so we only do it the first time, and leave such things
+      // for later runs of this pass and for --converge.
     }
     // Note that in principle even more cycles could find further work here, in
     // very rare cases. To avoid constructing a LocalGraph again just for that
-    // unlikely chance, we leave such things for later runs of this pass and for
-    // --converge.
+    // unlikely chance, we leave such things for later.
   }

   template<typename T> void reuseConstantNode(T* curr, Flow flow) {
@@ -281,6 +293,9 @@ struct Precompute
     }
     if (flow.breaking()) {
       if (flow.breakTo == NONCONSTANT_FLOW) {
+        // This cannot be turned into a constant, but perhaps we can partially
+        // precompute it.
+        considerPartiallyPrecomputing(curr);
         return;
       }
       if (flow.breakTo == RETURN_FLOW) {
@@ -319,6 +334,273 @@ struct Precompute
     }
   }

+  // If we failed to precompute a constant, perhaps we can still precompute part
+  // of an expression. Specifically, consider this case:
+  //
+  //  (A
+  //    (select
+  //      (B)
+  //      (C)
+  //      (condition)
+  //    )
+  //  )
+  //
+  // Perhaps we can compute A(B) and A(C). If so, we can emit a better select:
+  //
+  //  (select
+  //    (constant result of A(B))
+  //    (constant result of A(C))
+  //    (condition)
+  //  )
+  //
+  // Note that in general for code size we want to move operations *out* of
+  // selects and ifs (OptimizeInstructions does that), but here we are
+  // computing two constants which replace three expressions, so it is
+  // worthwhile.
+  //
+  // To do such partial precomputing, in the main pass we note selects that look
+  // promising. If we find any then we do a second pass later just for that (as
+  // doing so requires walking up the stack in a manner that we want to avoid in
+  // the main pass for overhead reasons; see below).
+  //
+  // Note that selects are all we really need here: other passes would turn an
+  // if into a select if the arms are simple enough, and only in those cases
+  // (simple arms) do we have a chance at partially precomputing. For example,
+  // if an arm is a constant then we can, but if it is a call then we can't.
+  // However, there are cases that we miss at the moment, such as an if whose
+  // arms have side effects but end in precomputable things. TODO
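+  //
+  // As one concrete illustration (a hypothetical example, not taken from the
+  // cases above), an eqz applied to a select of constants folds away entirely:
+  //
+  //  (i32.eqz           ;; A
+  //    (select
+  //      (i32.const 0)  ;; B
+  //      (i32.const 42) ;; C
+  //      (condition)
+  //    )
+  //  )
+  //
+  // becomes
+  //
+  //  (select
+  //    (i32.const 1)  ;; A(B)
+  //    (i32.const 0)  ;; A(C)
+  //    (condition)
+  //  )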
+  std::unordered_set<Select*> partiallyPrecomputable;
+
+  void considerPartiallyPrecomputing(Expression* curr) {
+    if (!canPartiallyPrecompute) {
+      return;
+    }
+
+    if (auto* select = curr->dynCast<Select>()) {
+      // We only have a reasonable hope of success if the select arms are things
+      // like constants or global gets. As a first approximation, allow the set
+      // of things we allow in constant initializers (but we can probably allow
+      // more here TODO).
+      //
+      // We also ignore selects with no parent (that are the entire function
+      // body) as then there is nothing to optimize into their arms.
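+      //
+      // For example (an illustrative sketch), a select like
+      //
+      //  (select
+      //    (global.get $g)
+      //    (i32.const 1)
+      //    (condition)
+      //  )
+      //
+      // would be noted as promising here, while one with, say, a call in an
+      // arm would not be, as a call is not a valid constant expression.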
+      auto& wasm = *getModule();
+      if (Properties::isValidConstantExpression(wasm, select->ifTrue) &&
+          Properties::isValidConstantExpression(wasm, select->ifFalse) &&
+          getFunction()->body != select) {
+        partiallyPrecomputable.insert(select);
+      }
+    }
+  }
+
+  // To partially precompute selects we walk up the stack from them, like this:
+  //
+  //  (A
+  //    (B
+  //      (select
+  //        (C)
+  //        (D)
+  //        (condition)
+  //      )
+  //    )
+  //  )
+  //
+  // First we try to apply B to C and D. If that works, we arrive at this:
+  //
+  //  (A
+  //    (select
+  //      (constant result of B(C))
+  //      (constant result of B(D))
+  //      (condition)
+  //    )
+  //  )
+  //
+  // We can then proceed to perhaps apply A. However, even if we failed to apply
+  // B then we can try to apply A and B together, because that combination may
+  // succeed where incremental work fails, for example:
+  //
+  //  (global $C
+  //    (struct.new ;; outer
+  //      (struct.new ;; inner
+  //        (i32.const 10)
+  //      )
+  //    )
+  //  )
+  //
+  //  (struct.get ;; outer
+  //    (struct.get ;; inner
+  //      (select
+  //        (global.get $C)
+  //        (global.get $D)
+  //        (condition)
+  //      )
+  //    )
+  //  )
+  //
+  // Applying the inner struct.get to $C leads us to the inner struct.new, but
+  // that is an interior pointer in the global - it is not something we can
+  // refer to using a global.get, so precomputing it fails. However, when we
+  // apply both struct.gets at once we arrive at the outer struct.new, which is
+  // in fact the global $C, and we succeed.
+  void partiallyPrecompute(Function* func) {
+    if (!canPartiallyPrecompute || partiallyPrecomputable.empty()) {
+      // Nothing to do.
+      return;
+    }
+
+    // Walk the function to find the parent stacks of the promising selects. We
+    // copy the stacks and process them later. We do it like this because if we
+    // wanted to process stacks as we reached them then we'd trip over
+    // ourselves: when we optimize we replace a parent, but that parent is an
+    // expression we'll reach later in the walk, so modifying it is unsafe.
+    struct StackFinder : public ExpressionStackWalker<StackFinder> {
+      Precompute& parent;
+
+      StackFinder(Precompute& parent) : parent(parent) {}
+
+      // We will later iterate on this in the order of insertion, which keeps
+      // things deterministic, and also usually lets us do consecutive work
+      // like a select nested in another select's condition, simply because we
+      // will traverse the selects in postorder (however, because we cannot
+      // always succeed in an incremental manner - see the comment on this
+      // function - it is possible in theory that some work can happen only in a
+      // later execution of the pass).
+      InsertOrderedMap<Select*, ExpressionStack> stackMap;
+
+      void visitSelect(Select* curr) {
+        if (parent.partiallyPrecomputable.count(curr)) {
+          stackMap[curr] = expressionStack;
+        }
+      }
+    } stackFinder(*this);
+    stackFinder.walkFunction(func);
+
+    // Note which expressions we've modified as we go, as it is invalid to
+    // modify more than once. This could happen in theory in a situation like
+    // this:
+    //
+    //  (ternary.f32.max ;; fictional instruction for explanatory purposes
+    //    (select ..)
+    //    (select ..)
+    //    (f32.infinity)
+    //  )
+    //
+    // When we consider the first select we can see that the computation result
+    // is always infinity, so we can optimize here and replace the ternary. Then
+    // the same thing happens with the second select, causing the ternary to be
+    // replaced again, which is unsafe because it no longer exists after we
+    // precomputed it the first time. (Note that in this example the result is
+    // the same either way, but at least in theory an instruction could exist
+    // for which there was a difference.) In practice wasm does not seem to have
+    // instructions capable of this atm, but this code is still useful to guard
+    // against future problems, and as a minor speedup (we quickly skip code
+    // that was already modified).
+    std::unordered_set<Expression*> modified;
+
+    for (auto& [select, stack] : stackFinder.stackMap) {
+      // Each stack ends in the select itself, and contains more than the select
+      // itself (otherwise we'd have ignored the select), i.e., the select has a
+      // parent that we can try to optimize into the arms.
+      assert(stack.back() == select);
+      assert(stack.size() >= 2);
+      Index selectIndex = stack.size() - 1;
+      assert(selectIndex >= 1);
+
+      if (modified.count(select)) {
+        // This select was modified; go to the next one.
+        continue;
+      }
+
+      // Go up through the parents, until we can't do any more work. At each
+      // parent we'll try to execute it and all intermediate parents into the
+      // select arms.
+      for (Index parentIndex = selectIndex - 1; parentIndex != Index(-1);
+           parentIndex--) {
+        auto* parent = stack[parentIndex];
+        if (modified.count(parent)) {
+          // This parent was modified; exit the loop on parents as no upper
+          // parent is valid to try either.
+          break;
+        }
+
+        // If the parent lacks a concrete type then we can't move it into the
+        // select: the select needs a concrete (and non-tuple) type. For example,
+        // if the parent is a drop or is unreachable, those are things we don't
+        // want to handle, and we stop here (once we see one such parent we
+        // can't expect to make any more progress).
+        if (!parent->type.isConcrete() || parent->type.isTuple()) {
+          break;
+        }
+
+        // We are precomputing the select arms, but leaving the condition as-is.
+        // If the condition breaks to the parent, then we can't move the parent
+        // into the select arms:
+        //
+        //  (block $name ;; this must stay outside of the select
+        //    (select
+        //      (B)
+        //      (C)
+        //      (block ;; condition
+        //        (br_if $name
+        //
+        // Ignore all control flow for simplicity, as it isn't interesting
+        // for us, and other passes should have removed it anyhow.
+        if (Properties::isControlFlowStructure(parent)) {
+          break;
+        }
+
+        // This looks promising, so try to precompute here. What we do is
+        // precompute twice, once with the select replaced with the left arm,
+        // and once with the right. If both succeed then we can create a new
+        // select (with the same condition as before) whose arms are the
+        // precomputed values.
+        auto isValidPrecomputation = [&](const Flow& flow) {
+          // For now we handle simple concrete values. We could also handle
+          // breaks in principle TODO
+          return canEmitConstantFor(flow.values) && !flow.breaking() &&
+                 flow.values.isConcrete();
+        };
+
+        // Find the pointer to the select in its immediate parent so that we can
+        // replace it first with one arm and then the other.
+        auto** pointerToSelect =
+          getChildPointerInImmediateParent(stack, selectIndex, func);
+        *pointerToSelect = select->ifTrue;
+        auto ifTrue = precomputeExpression(parent);
+        if (isValidPrecomputation(ifTrue)) {
+          *pointerToSelect = select->ifFalse;
+          auto ifFalse = precomputeExpression(parent);
+          if (isValidPrecomputation(ifFalse)) {
+            // Wonderful, we can precompute here! The select can now contain the
+            // computed values in its arms.
+            select->ifTrue = ifTrue.getConstExpression(*getModule());
+            select->ifFalse = ifFalse.getConstExpression(*getModule());
+            select->finalize();
+
+            // The parent of the select is now replaced by the select.
+            auto** pointerToParent =
+              getChildPointerInImmediateParent(stack, parentIndex, func);
+            *pointerToParent = select;
+
+            // Update state for further iterations: mark everything modified and
+            // move the select to the parent's location.
+            for (Index i = parentIndex; i <= selectIndex; i++) {
+              modified.insert(stack[i]);
+            }
+            selectIndex = parentIndex;
+            stack[selectIndex] = select;
+            stack.resize(selectIndex + 1);
+          }
+        }
+
+        // Whether or not we succeeded in precomputing here, restore the
+        // parent's pointer to its original state (if we precomputed, the parent
+        // is no longer in use, but there is no harm in modifying it).
+        *pointerToSelect = select;
+      }
+    }
+  }
+
   void visitFunction(Function* curr) {
     // removing breaks can alter types
     ReFinalize().walkFunctionInModule(curr, getModule());
@@ -531,6 +813,30 @@ struct Precompute

     return true;
   }
+
+  // Helpers for partial precomputing.
+
+  // Given a stack of expressions and the index of an expression in it, find
+  // the pointer to that expression in the parent. This gives us a pointer that
+  // allows us to replace the expression.
+  Expression** getChildPointerInImmediateParent(const ExpressionStack& stack,
+                                                Index index,
+                                                Function* func) {
+    if (index == 0) {
+      // There is nothing above this expression, so the pointer referring to it
+      // is the function's body.
+      return &func->body;
+    }
+
+    auto* child = stack[index];
+    for (auto** currChild : ChildIterator(stack[index - 1]).children) {
+      if (*currChild == child) {
+        return currChild;
+      }
+    }
+
+    WASM_UNREACHABLE("child not found in parent");
+  }
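+
+  // (Note that the helper above does a simple linear scan of the immediate
+  // parent's children to find the matching pointer; we call it at most a
+  // couple of times per parent that we try, so this is cheap in practice.)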
 };

 Pass* createPrecomputePass() { return new Precompute(false); }
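+
+// A usage sketch (hypothetical, for illustration only): like other passes,
+// this one is normally scheduled by name through a PassRunner, e.g.
+//
+//   PassRunner runner(&module);
+//   runner.add("precompute-propagate"); // the propagating variant
+//   runner.run();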