Skip to content

Commit ff5d6f9

Browse files
authored
Loop Invariant Code Motion (#1658)
This adds an licm pass. Not that important for LLVM-originating code obviously, but for AssemblyScript and other non-LLVM compilers this might help a lot. Also when wasm has GC a bunch more non-LLVM languages may arrive that can benefit. The pass is mostly straightforward. I considered using the DataFlow IR since it's in SSA form, or the CFG IR, but in the end it's actually pretty convenient to use the main IR as it is - with explicit loops already present - plus LocalGraph which connects each get to the sets influencing it. Passed a bunch of fuzzing, and also the emscripten test suite at -O1 with licm added to the default passes (but I don't think it would make sense to run this by default, as LLVM doesn't need it). We limit code moved by this pass as follows: An increased code size on fuzz testcases (and, more rarely, on real inputs) can happen due to stuff like this: (loop (set_local $x (i32.const 1)) .. ) => (set_local $x (i32.const 1)) (loop .. ) For a const or a get_local, such an assignment to a local is both very cheap (a copy to another local may be optimized out later), and moving it out may prevent other optimizations (since we have no pass that tries to move code back in to a loop edit well, not by default, precompute-propagate etc. would do it, but are only run on high opt levels). So I made the pass not move such trivial code (sets/tees of consts or gets). However, the risk remains if code is moved out that is later reduced to a constant, so something like -Os --flatten --licm -Os may make sense.
1 parent a852156 commit ff5d6f9

File tree

9 files changed

+1335
-3
lines changed

9 files changed

+1335
-3
lines changed

build-js.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ echo "building shared bitcode"
106106
$BINARYEN_SRC/passes/LegalizeJSInterface.cpp \
107107
$BINARYEN_SRC/passes/LocalCSE.cpp \
108108
$BINARYEN_SRC/passes/LogExecution.cpp \
109+
$BINARYEN_SRC/passes/LoopInvariantCodeMotion.cpp \
109110
$BINARYEN_SRC/passes/MemoryPacking.cpp \
110111
$BINARYEN_SRC/passes/MergeBlocks.cpp \
111112
$BINARYEN_SRC/passes/MergeLocals.cpp \

src/ir/effects.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ struct EffectAnalyzer : public PostWalker<EffectAnalyzer> {
3939
if (breakNames.size() > 0) branches = true;
4040
}
4141

42+
// Core effect tracking
43+
4244
bool branches = false; // branches out of this expression, returns, infinite loops, etc
4345
bool calls = false;
4446
std::set<Index> localsRead;
@@ -55,13 +57,18 @@ struct EffectAnalyzer : public PostWalker<EffectAnalyzer> {
5557
bool isAtomic = false; // An atomic load/store/RMW/Cmpxchg or an operator that
5658
// has a defined ordering wrt atomics (e.g. grow_memory)
5759

60+
// Helper functions to check for various effect types
61+
5862
bool accessesLocal() { return localsRead.size() + localsWritten.size() > 0; }
5963
bool accessesGlobal() { return globalsRead.size() + globalsWritten.size() > 0; }
6064
bool accessesMemory() { return calls || readsMemory || writesMemory; }
65+
6166
bool hasGlobalSideEffects() { return calls || globalsWritten.size() > 0 || writesMemory || isAtomic; }
6267
bool hasSideEffects() { return hasGlobalSideEffects() || localsWritten.size() > 0 || branches || implicitTrap; }
6368
bool hasAnything() { return branches || calls || accessesLocal() || readsMemory || writesMemory || accessesGlobal() || implicitTrap || isAtomic; }
6469

70+
bool noticesGlobalSideEffects() { return calls || readsMemory || isAtomic || globalsRead.size(); }
71+
6572
// check if we break to anything external from ourselves
6673
bool hasExternalBreakTargets() { return !breakNames.empty(); }
6774

@@ -113,6 +120,8 @@ struct EffectAnalyzer : public PostWalker<EffectAnalyzer> {
113120
calls = calls || other.calls;
114121
readsMemory = readsMemory || other.readsMemory;
115122
writesMemory = writesMemory || other.writesMemory;
123+
implicitTrap = implicitTrap || other.implicitTrap;
124+
isAtomic = isAtomic || other.isAtomic;
116125
for (auto i : other.localsRead) localsRead.insert(i);
117126
for (auto i : other.localsWritten) localsWritten.insert(i);
118127
for (auto i : other.globalsRead) globalsRead.insert(i);

src/passes/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,14 @@ SET(passes_SOURCES
1515
ExtractFunction.cpp
1616
Flatten.cpp
1717
FuncCastEmulation.cpp
18+
I64ToI32Lowering.cpp
1819
Inlining.cpp
20+
InstrumentLocals.cpp
21+
InstrumentMemory.cpp
1922
LegalizeJSInterface.cpp
2023
LocalCSE.cpp
2124
LogExecution.cpp
22-
I64ToI32Lowering.cpp
23-
InstrumentLocals.cpp
24-
InstrumentMemory.cpp
25+
LoopInvariantCodeMotion.cpp
2526
MemoryPacking.cpp
2627
MergeBlocks.cpp
2728
MergeLocals.cpp
Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
/*
2+
* Copyright 2018 WebAssembly Community Group participants
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
//
18+
// Simple loop invariant code motion (licm): for every none-typed
19+
// expression in a loop, see if we can move it out.
20+
//
21+
// Flattening is not necessary here, but may help (as separating
22+
// out expressions may allow moving at least part of a larger whole).
23+
//
24+
25+
#include <unordered_map>
26+
27+
#include "wasm.h"
28+
#include "pass.h"
29+
#include "wasm-builder.h"
30+
#include "ir/local-graph.h"
31+
#include "ir/effects.h"
32+
#include "ir/find_all.h"
33+
34+
namespace wasm {
35+
36+
struct LoopInvariantCodeMotion : public WalkerPass<ExpressionStackWalker<LoopInvariantCodeMotion>> {
37+
bool isFunctionParallel() override { return true; }
38+
39+
Pass* create() override { return new LoopInvariantCodeMotion; }
40+
41+
typedef std::unordered_set<SetLocal*> LoopSets;
42+
43+
// main entry point
44+
45+
LocalGraph* localGraph;
46+
47+
void doWalkFunction(Function* func) {
48+
// Compute all local dependencies first.
49+
LocalGraph localGraphInstance(func);
50+
localGraph = &localGraphInstance;
51+
// Traverse the function.
52+
super::doWalkFunction(func);
53+
}
54+
55+
void visitLoop(Loop* loop) {
56+
// We accumulate all the code we can move out, and will place it
57+
// in a block just preceding the loop.
58+
std::vector<Expression*> movedCode;
59+
// Accumulate effects of things we can't move out - things
60+
// we move out later must cross them, so we must verify it
61+
// is ok to do so.
62+
EffectAnalyzer effectsSoFar(getPassOptions());
63+
// The loop's total effects also matter. For example, a store
64+
// in the loop means we can't move a load outside.
65+
// FIXME: we look at the loop "tail" area too, after the last
66+
// possible branch back, which can cause false positives
67+
// for bad effect interactions.
68+
EffectAnalyzer loopEffects(getPassOptions(), loop);
69+
// Note all the sets in each loop, and how many per index. Currently
70+
// EffectAnalyzer can't do that, and we need it to know if we
71+
// can move a set out of the loop (if there is another set
72+
// still there, we can't). Another possible option here is for
73+
// LocalGraph to track interfering sets. TODO
74+
// FIXME: also the loop tail issue from above.
75+
auto numLocals = getFunction()->getNumLocals();
76+
std::vector<Index> numSetsForIndex(numLocals);
77+
std::fill(numSetsForIndex.begin(), numSetsForIndex.end(), 0);
78+
LoopSets loopSets;
79+
{
80+
FindAll<SetLocal> finder(loop);
81+
for (auto* set : finder.list) {
82+
numSetsForIndex[set->index]++;
83+
loopSets.insert(set);
84+
}
85+
}
86+
// Walk along the loop entrance, while all the code there
87+
// is executed unconditionally. That is the code we want to
88+
// move out - anything that might or might not be executed
89+
// may be best left alone anyhow.
90+
std::vector<Expression**> work;
91+
work.push_back(&loop->body);
92+
while (!work.empty()) {
93+
auto** currp = work.back();
94+
work.pop_back();
95+
auto* curr = *currp;
96+
// Look into blocks.
97+
if (auto* block = curr->dynCast<Block>()) {
98+
auto& list = block->list;
99+
Index i = list.size();
100+
while (i > 0) {
101+
i--;
102+
work.push_back(&list[i]);
103+
}
104+
continue;
105+
// Note that if the block had a merge at the end, we would have seen
106+
// a branch to it anyhow, so we would stop before that point anyhow.
107+
}
108+
// If this may branch, we are done.
109+
EffectAnalyzer effects(getPassOptions(), curr);
110+
if (effects.branches) {
111+
break;
112+
}
113+
if (interestingToMove(curr)) {
114+
// Let's see if we can move this out.
115+
// Global side effects would prevent this - we might end up
116+
// executing them just once.
117+
// And we must also move across anything not moved out already,
118+
// so check for issues there too.
119+
// The rest of the loop's effects matter too, we must also
120+
// take into account global state like interacting loads and
121+
// stores.
122+
bool unsafeToMove = effects.hasGlobalSideEffects() ||
123+
effectsSoFar.invalidates(effects) ||
124+
(effects.noticesGlobalSideEffects() &&
125+
loopEffects.hasGlobalSideEffects());
126+
if (!unsafeToMove) {
127+
// So far so good. Check if our local dependencies are all
128+
// outside of the loop, in which case everything is good -
129+
// either they are before the loop and constant for us, or
130+
// they are after and don't matter.
131+
if (effects.localsRead.empty() || !hasGetDependingOnLoopSet(curr, loopSets)) {
132+
// We have checked if our gets are influenced by sets in the loop, and
133+
// must also check if our sets interfere with them. To do so, assume
134+
// temporarily that we are moving curr out; see if any sets remain for
135+
// its indexes.
136+
FindAll<SetLocal> currSets(curr);
137+
for (auto* set : currSets.list) {
138+
assert(numSetsForIndex[set->index] > 0);
139+
numSetsForIndex[set->index]--;
140+
}
141+
bool canMove = true;
142+
for (auto* set : currSets.list) {
143+
if (numSetsForIndex[set->index] > 0) {
144+
canMove = false;
145+
break;
146+
}
147+
}
148+
if (!canMove) {
149+
// We failed to move the code, undo those changes.
150+
for (auto* set : currSets.list) {
151+
numSetsForIndex[set->index]++;
152+
}
153+
} else {
154+
// We can move it! Leave the changes, move the code, and update
155+
// loopSets.
156+
movedCode.push_back(curr);
157+
*currp = Builder(*getModule()).makeNop();
158+
for (auto* set : currSets.list) {
159+
loopSets.erase(set);
160+
}
161+
continue;
162+
}
163+
}
164+
}
165+
}
166+
// We did not move this item. Accumulate its effects.
167+
effectsSoFar.mergeIn(effects);
168+
}
169+
// If we moved the code out, finish up by emitting it
170+
// outside of the loop.
171+
// Note that this works with nested loops - after moving outside
172+
// of an inner loop, we can encounter it again in an outer loop,
173+
// and move it further outside, without requiring any extra pass.
174+
if (!movedCode.empty()) {
175+
// Finish the moving by emitting the code outside.
176+
Builder builder(*getModule());
177+
auto* ret = builder.makeBlock(movedCode);
178+
ret->list.push_back(loop);
179+
ret->finalize(loop->type);
180+
replaceCurrent(ret);
181+
// Note that we do not need to modify the localGraph - we keep
182+
// each get in a position to be influenced by exactly the same
183+
// sets as before.
184+
}
185+
}
186+
187+
bool interestingToMove(Expression* curr) {
188+
// In theory we could consider blocks, but then heavy nesting of
189+
// switch patterns would be heavy, and almost always pointless.
190+
if (curr->type != none || curr->is<Nop>() || curr->is<Block>()
191+
|| curr->is<Loop>()) {
192+
return false;
193+
}
194+
// Don't move copies (set of a get, or set of a tee of a get, etc.),
195+
// as they often do not turn into actual work inside the loop
196+
// (after later optimizations by us or the VM), and if we move them
197+
// out it will prevent other opts from potentially eliminating the
198+
// copy, as we don't have another pass that considers moving code
199+
// back *into* loops.
200+
// Likewise, constants also are not worth moving out.
201+
// Note that the issue remains for moving things out which later
202+
// optimizations turn into a copy or a constant, so in general
203+
// it is beneficial to run this pass later on (but that has downsides
204+
// too, as with more nesting moving code is harder - so something
205+
// like -O --flatten --licm -O may be best).
206+
if (auto* set = curr->dynCast<SetLocal>()) {
207+
while (1) {
208+
auto* next = set->value->dynCast<SetLocal>();
209+
if (!next) break;
210+
set = next;
211+
}
212+
if (set->value->is<GetLocal>() || set->value->is<Const>()) {
213+
return false;
214+
}
215+
}
216+
return true;
217+
}
218+
219+
bool hasGetDependingOnLoopSet(Expression* curr, LoopSets& loopSets) {
220+
FindAll<GetLocal> gets(curr);
221+
for (auto* get : gets.list) {
222+
auto& sets = localGraph->getSetses[get];
223+
for (auto* set : sets) {
224+
// nullptr means a parameter or zero-init value;
225+
// no danger to us.
226+
if (!set) continue;
227+
// Check if the set is in the loop. If not, it's either before,
228+
// which is fine, or after, which is also fine - moving curr
229+
// to just outside the loop will preserve those relationships.
230+
// TODO: this still counts curr's sets as inside the loop, which
231+
// might matter in non-flat mode.
232+
if (loopSets.count(set)) {
233+
return true;
234+
}
235+
}
236+
}
237+
return false;
238+
}
239+
};
240+
241+
Pass *createLoopInvariantCodeMotionPass() {
242+
return new LoopInvariantCodeMotion();
243+
}
244+
245+
} // namespace wasm
246+

src/passes/RedundantSetElimination.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ namespace wasm {
4949
// its current value
5050
typedef std::vector<Index> LocalValues;
5151

52+
namespace {
53+
5254
// information in a basic block
5355
struct Info {
5456
LocalValues start, end; // the local values at the start and end of the block
@@ -366,6 +368,8 @@ struct RedundantSetElimination : public WalkerPass<CFGWalker<RedundantSetElimina
366368
}
367369
};
368370

371+
} // namespace
372+
369373
Pass *createRedundantSetEliminationPass() {
370374
return new RedundantSetElimination();
371375
}

src/passes/pass.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ void PassRegistry::registerPasses() {
8787
registerPass("i64-to-i32-lowering", "lower all uses of i64s to use i32s instead", createI64ToI32LoweringPass);
8888
registerPass("instrument-locals", "instrument the build with code to intercept all loads and stores", createInstrumentLocalsPass);
8989
registerPass("instrument-memory", "instrument the build with code to intercept all loads and stores", createInstrumentMemoryPass);
90+
registerPass("licm", "loop invariant code motion", createLoopInvariantCodeMotionPass);
9091
registerPass("memory-packing", "packs memory into separate segments, skipping zeros", createMemoryPackingPass);
9192
registerPass("merge-blocks", "merges blocks to their parents", createMergeBlocksPass);
9293
registerPass("merge-locals", "merges locals when beneficial", createMergeLocalsPass);

src/passes/passes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ Pass* createLocalCSEPass();
4444
Pass* createLogExecutionPass();
4545
Pass* createInstrumentLocalsPass();
4646
Pass* createInstrumentMemoryPass();
47+
Pass* createLoopInvariantCodeMotionPass();
4748
Pass* createMemoryPackingPass();
4849
Pass* createMergeBlocksPass();
4950
Pass* createMergeLocalsPass();

0 commit comments

Comments
 (0)