Skip to content

Commit db717e3

Browse files
authored
JIT: Add a pass of early liveness and use it for forward sub and last-use copy elision for implicit byrefs (#79346)
This runs a pass of liveness right after local morph and uses it for forward sub and to omit copies of structs when passed as implicit byrefs at their last use. Fix #76069 Fix #75206 Fix #65025 Fix #9839 This PR introduces the following new JIT invariants: * When optimizing, local morph will now thread all locals into a tree list accessed by Statement::LocalsTreeList. This tree list is kept valid starting from local morph and ending with forward sub. There is no memory impact of this since we reuse the GenTree::gtPrev and GenTree::gtNext fields. * Early liveness information (GTF_VAR_DEATH and the promoted struct death vars map) is kept valid (sound) starting from early liveness and ending with morph. There are asserts that the tree list is up to date when it is accessed. This is done through a new member fgNodeThreading that replaces the preexisting fgStmtListThreaded and keeps information about what the current kind of node threading is. The benefits are large: -2 MB on win-x64 collections (-0.85% on libraries.pmi that only has optimized contexts), with a number of regressions as expected when removing locals. The improvements primarily come from the omission of copies for implicit byrefs, so the benefit on platforms with fewer implicit byrefs is smaller, but the forward sub change alone is still very impactful (e.g. -300K on linux-x64). The throughput impact is around 1% in optimized contexts and below 0.1% in unoptimized contexts, the latter due to local morph needing to check if it should be threading nodes.
1 parent 7c265c3 commit db717e3

29 files changed

+1585
-270
lines changed

src/coreclr/jit/assertionprop.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3651,6 +3651,15 @@ GenTree* Compiler::optCopyAssertionProp(AssertionDsc* curAssertion,
36513651
tree->SetLclNum(copyLclNum);
36523652
tree->SetSsaNum(copySsaNum);
36533653

3654+
// Copy prop and last-use copy elision happen at the same time in morph.
3655+
// This node may potentially not be a last use of the new local.
3656+
//
3657+
// TODO-CQ: It is probably better to avoid doing this propagation if we
3658+
// would otherwise omit an implicit byref copy since this propagation will
3659+
// force us to create another copy anyway.
3660+
//
3661+
tree->gtFlags &= ~GTF_VAR_DEATH;
3662+
36543663
#ifdef DEBUG
36553664
if (verbose)
36563665
{

src/coreclr/jit/block.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,8 @@ enum BasicBlockFlags : unsigned __int64
542542
BBF_BACKWARD_JUMP_SOURCE = MAKE_BBFLAG(41), // Block is a source of a backward jump
543543
BBF_HAS_MDARRAYREF = MAKE_BBFLAG(42), // Block has a multi-dimensional array reference
544544

545+
BBF_RECURSIVE_TAILCALL = MAKE_BBFLAG(43), // Block has recursive tailcall that may turn into a loop
546+
545547
// The following are sets of flags.
546548

547549
// Flags that relate blocks to loop structure.
@@ -562,7 +564,7 @@ enum BasicBlockFlags : unsigned __int64
562564
// For example, the top block might or might not have BBF_GC_SAFE_POINT,
563565
// but we assume it does not have BBF_GC_SAFE_POINT any more.
564566

565-
BBF_SPLIT_LOST = BBF_GC_SAFE_POINT | BBF_HAS_JMP | BBF_KEEP_BBJ_ALWAYS | BBF_CLONED_FINALLY_END,
567+
BBF_SPLIT_LOST = BBF_GC_SAFE_POINT | BBF_HAS_JMP | BBF_KEEP_BBJ_ALWAYS | BBF_CLONED_FINALLY_END | BBF_RECURSIVE_TAILCALL,
566568

567569
// Flags gained by the bottom block when a block is split.
568570
// Note, this is a conservative guess.

src/coreclr/jit/compiler.cpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4391,8 +4391,8 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
43914391

43924392
// Enable the post-phase checks that use internal logic to decide when checking makes sense.
43934393
//
4394-
activePhaseChecks =
4395-
PhaseChecks::CHECK_EH | PhaseChecks::CHECK_LOOPS | PhaseChecks::CHECK_UNIQUE | PhaseChecks::CHECK_PROFILE;
4394+
activePhaseChecks = PhaseChecks::CHECK_EH | PhaseChecks::CHECK_LOOPS | PhaseChecks::CHECK_UNIQUE |
4395+
PhaseChecks::CHECK_PROFILE | PhaseChecks::CHECK_LINKED_LOCALS;
43964396

43974397
// Import: convert the instrs in each basic block to a tree based intermediate representation
43984398
//
@@ -4604,10 +4604,23 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
46044604
//
46054605
DoPhase(this, PHASE_STR_ADRLCL, &Compiler::fgMarkAddressExposedLocals);
46064606

4607+
if (opts.OptimizationEnabled())
4608+
{
4609+
fgNodeThreading = NodeThreading::AllLocals;
4610+
}
4611+
4612+
// Do an early pass of liveness for forward sub and morph. This data is
4613+
// valid until after morph.
4614+
//
4615+
DoPhase(this, PHASE_EARLY_LIVENESS, &Compiler::fgEarlyLiveness);
4616+
46074617
// Run a simple forward substitution pass.
46084618
//
46094619
DoPhase(this, PHASE_FWD_SUB, &Compiler::fgForwardSub);
46104620

4621+
// Locals tree list is no longer kept valid.
4622+
fgNodeThreading = NodeThreading::None;
4623+
46114624
// Apply the type update to implicit byref parameters; also choose (based on address-exposed
46124625
// analysis) which implicit byref promotions to keep (requires copy to initialize) or discard.
46134626
//
@@ -4750,6 +4763,8 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
47504763
//
47514764
DoPhase(this, PHASE_SET_BLOCK_ORDER, &Compiler::fgSetBlockOrder);
47524765

4766+
fgNodeThreading = NodeThreading::AllTrees;
4767+
47534768
// At this point we know if we are fully interruptible or not
47544769
if (opts.OptimizationEnabled())
47554770
{
@@ -4942,6 +4957,8 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
49424957
Rationalizer rat(this); // PHASE_RATIONALIZE
49434958
rat.Run();
49444959

4960+
fgNodeThreading = NodeThreading::LIR;
4961+
49454962
// Here we do "simple lowering". When the RyuJIT backend works for all
49464963
// platforms, this will be part of the more general lowering phase. For now, though, we do a separate
49474964
// pass of "final lowering." We must do this before (final) liveness analysis, because this creates

src/coreclr/jit/compiler.h

Lines changed: 63 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,13 +1454,14 @@ extern const char* PhaseEnums[];
14541454
// clang-format off
14551455
enum class PhaseChecks : unsigned int
14561456
{
1457-
CHECK_NONE = 0,
1458-
CHECK_IR = 1 << 0, // ir flags, etc
1459-
CHECK_UNIQUE = 1 << 1, // tree node uniqueness
1460-
CHECK_FG = 1 << 2, // flow graph integrity
1461-
CHECK_EH = 1 << 3, // eh table integrity
1462-
CHECK_LOOPS = 1 << 4, // loop table integrity
1463-
CHECK_PROFILE = 1 << 5, // profile data integrity
1457+
CHECK_NONE = 0,
1458+
CHECK_IR = 1 << 0, // ir flags, etc
1459+
CHECK_UNIQUE = 1 << 1, // tree node uniqueness
1460+
CHECK_FG = 1 << 2, // flow graph integrity
1461+
CHECK_EH = 1 << 3, // eh table integrity
1462+
CHECK_LOOPS = 1 << 4, // loop table integrity
1463+
CHECK_PROFILE = 1 << 5, // profile data integrity
1464+
CHECK_LINKED_LOCALS = 1 << 6, // check linked list of locals
14641465
};
14651466

14661467
inline constexpr PhaseChecks operator ~(PhaseChecks a)
@@ -1860,6 +1861,16 @@ struct RichIPMapping
18601861
DebugInfo debugInfo;
18611862
};
18621863

1864+
// Current kind of node threading stored in GenTree::gtPrev and GenTree::gtNext.
1865+
// See fgNodeThreading for more information.
1866+
enum class NodeThreading
1867+
{
1868+
None,
1869+
AllLocals, // Locals are threaded (after local morph when optimizing)
1870+
AllTrees, // All nodes are threaded (after fgSetBlockOrder)
1871+
LIR, // Nodes are in LIR form (after rationalization)
1872+
};
1873+
18631874
/*
18641875
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
18651876
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
@@ -1910,6 +1921,7 @@ class Compiler
19101921
friend class LIR;
19111922
friend class ObjectAllocator;
19121923
friend class LocalAddressVisitor;
1924+
friend struct Statement;
19131925
friend struct GenTree;
19141926
friend class MorphInitBlockHelper;
19151927
friend class MorphCopyBlockHelper;
@@ -2789,7 +2801,7 @@ class Compiler
27892801
// is #of nodes in subtree) of "tree" is greater than "limit".
27902802
// (This is somewhat redundant with the "GetCostEx()/GetCostSz()" fields, but can be used
27912803
// before they have been set.)
2792-
bool gtComplexityExceeds(GenTree** tree, unsigned limit);
2804+
bool gtComplexityExceeds(GenTree* tree, unsigned limit);
27932805

27942806
GenTree* gtReverseCond(GenTree* tree);
27952807

@@ -4448,30 +4460,47 @@ class Compiler
44484460
bool fgRemoveRestOfBlock; // true if we know that we will throw
44494461
bool fgStmtRemoved; // true if we remove statements -> need new DFA
44504462

4451-
// There are two modes for ordering of the trees.
4452-
// - In FGOrderTree, the dominant ordering is the tree order, and the nodes contained in
4453-
// each tree and sub-tree are contiguous, and can be traversed (in gtNext/gtPrev order)
4454-
// by traversing the tree according to the order of the operands.
4455-
// - In FGOrderLinear, the dominant ordering is the linear order.
4456-
44574463
enum FlowGraphOrder
44584464
{
44594465
FGOrderTree,
44604466
FGOrderLinear
44614467
};
4468+
// There are two modes for ordering of the trees.
4469+
// - In FGOrderTree, the dominant ordering is the tree order, and the nodes contained in
4470+
// each tree and sub-tree are contiguous, and can be traversed (in gtNext/gtPrev order)
4471+
// by traversing the tree according to the order of the operands.
4472+
// - In FGOrderLinear, the dominant ordering is the linear order.
44624473
FlowGraphOrder fgOrder;
44634474

4464-
// The following are boolean flags that keep track of the state of internal data structures
4465-
4466-
bool fgStmtListThreaded; // true if the node list is now threaded
4467-
bool fgCanRelocateEHRegions; // true if we are allowed to relocate the EH regions
4468-
bool fgEdgeWeightsComputed; // true after we have called fgComputeEdgeWeights
4469-
bool fgHaveValidEdgeWeights; // true if we were successful in computing all of the edge weights
4470-
bool fgSlopUsedInEdgeWeights; // true if their was some slop used when computing the edge weights
4471-
bool fgRangeUsedInEdgeWeights; // true if some of the edgeWeight are expressed in Min..Max form
4472-
weight_t fgCalledCount; // count of the number of times this method was called
4473-
// This is derived from the profile data
4474-
// or is BB_UNITY_WEIGHT when we don't have profile data
4475+
// The following are flags that keep track of the state of internal data structures
4476+
4477+
// Even in tree form (fgOrder == FGOrderTree) the trees are threaded in a
4478+
// doubly linked list during certain phases of the compilation.
4479+
// - Local morph threads all locals to be used for early liveness and
4480+
// forward sub when optimizing. This is kept valid until after forward sub.
4481+
// The first local is kept in Statement::GetRootNode()->gtNext and the last
4482+
// local in Statement::GetRootNode()->gtPrev. fgSequenceLocals can be used
4483+
// to (re-)sequence a statement into this form, and
4484+
// Statement::LocalsTreeList for range-based iteration. The order must
4485+
// match tree order.
4486+
//
4487+
// - fgSetBlockOrder threads all nodes. This is kept valid until LIR form.
4488+
// In this form the first node is given by Statement::GetTreeList and the
4489+
// last node is given by Statement::GetRootNode(). fgSetStmtSeq can be used
4490+
// to (re-)sequence a statement into this form, and Statement::TreeList for
4491+
// range-based iteration. The order must match tree order.
4492+
//
4493+
// - Rationalization links all nodes into linear form which is kept until
4494+
// the end of compilation. The first and last nodes are stored in the block.
4495+
NodeThreading fgNodeThreading;
4496+
bool fgCanRelocateEHRegions; // true if we are allowed to relocate the EH regions
4497+
bool fgEdgeWeightsComputed; // true after we have called fgComputeEdgeWeights
4498+
bool fgHaveValidEdgeWeights; // true if we were successful in computing all of the edge weights
4499+
bool fgSlopUsedInEdgeWeights; // true if there was some slop used when computing the edge weights
4500+
bool fgRangeUsedInEdgeWeights; // true if some of the edgeWeight are expressed in Min..Max form
4501+
weight_t fgCalledCount; // count of the number of times this method was called
4502+
// This is derived from the profile data
4503+
// or is BB_UNITY_WEIGHT when we don't have profile data
44754504

44764505
#if defined(FEATURE_EH_FUNCLETS)
44774506
bool fgFuncletsCreated; // true if the funclet creation phase has been run
@@ -4724,6 +4753,8 @@ class Compiler
47244753
GenTreeLclVarCommon* lclVarNode);
47254754
bool fgComputeLifeLocal(VARSET_TP& life, VARSET_VALARG_TP keepAliveVars, GenTree* lclVarNode);
47264755

4756+
GenTree* fgTryRemoveDeadStoreEarly(Statement* stmt, GenTreeLclVarCommon* dst);
4757+
47274758
void fgComputeLife(VARSET_TP& life,
47284759
GenTree* startNode,
47294760
GenTree* endNode,
@@ -5419,6 +5450,7 @@ class Compiler
54195450
void fgDebugCheckLinks(bool morphTrees = false);
54205451
void fgDebugCheckStmtsList(BasicBlock* block, bool morphTrees);
54215452
void fgDebugCheckNodeLinks(BasicBlock* block, Statement* stmt);
5453+
void fgDebugCheckLinkedLocals();
54225454
void fgDebugCheckNodesUniqueness();
54235455
void fgDebugCheckLoopTable();
54245456
void fgDebugCheckSsa();
@@ -5835,6 +5867,8 @@ class Compiler
58355867

58365868
bool byrefStatesMatchGcHeapStates; // True iff GcHeap and ByrefExposed memory have all the same def points.
58375869

5870+
PhaseStatus fgEarlyLiveness();
5871+
58385872
void fgMarkUseDef(GenTreeLclVarCommon* tree);
58395873

58405874
void fgBeginScopeLife(VARSET_TP* inScope, VarScopeDsc* var);
@@ -5926,11 +5960,12 @@ class Compiler
59265960
void fgMarkDemotedImplicitByRefArgs();
59275961

59285962
PhaseStatus fgMarkAddressExposedLocals();
5929-
void fgMarkAddressExposedLocals(Statement* stmt);
5963+
void fgSequenceLocals(Statement* stmt);
59305964

59315965
PhaseStatus fgForwardSub();
59325966
bool fgForwardSubBlock(BasicBlock* block);
59335967
bool fgForwardSubStatement(Statement* statement);
5968+
void fgForwardSubUpdateLiveness(GenTree* newSubListFirst, GenTree* newSubListLast);
59345969

59355970
// The given local variable, required to be a struct variable, is being assigned via
59365971
// a "lclField", to make it masquerade as an integral type in the ABI. Make sure that
@@ -9049,6 +9084,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
90499084

90509085
bool fgLocalVarLivenessDone; // Note that this one is used outside of debug.
90519086
bool fgLocalVarLivenessChanged;
9087+
bool fgIsDoingEarlyLiveness;
9088+
bool fgDidEarlyLiveness;
90529089
bool compLSRADone;
90539090
bool compRationalIRForm;
90549091

@@ -10991,7 +11028,6 @@ class GenTreeVisitor
1099111028
if (TVisitor::UseExecutionOrder && node->IsReverseOp())
1099211029
{
1099311030
assert(node->AsMultiOp()->GetOperandCount() == 2);
10994-
1099511031
result = WalkTree(&node->AsMultiOp()->Op(2), node);
1099611032
if (result == fgWalkResult::WALK_ABORT)
1099711033
{

src/coreclr/jit/compphases.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ CompPhaseNameMacro(PHASE_UPDATE_FINALLY_FLAGS, "Update finally target flag
4343
CompPhaseNameMacro(PHASE_COMPUTE_PREDS, "Compute preds", false, -1, false)
4444
CompPhaseNameMacro(PHASE_EARLY_UPDATE_FLOW_GRAPH, "Update flow graph early pass", false, -1, false)
4545
CompPhaseNameMacro(PHASE_STR_ADRLCL, "Morph - Structs/AddrExp", false, -1, false)
46+
CompPhaseNameMacro(PHASE_EARLY_LIVENESS, "Early liveness", false, -1, false)
4647
CompPhaseNameMacro(PHASE_FWD_SUB, "Forward Substitution", false, -1, false)
4748
CompPhaseNameMacro(PHASE_MORPH_IMPBYREF, "Morph - ByRefs", false, -1, false)
4849
CompPhaseNameMacro(PHASE_PROMOTE_STRUCTS, "Morph - Promote Structs", false, -1, false)

src/coreclr/jit/earlyprop.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -650,7 +650,7 @@ bool Compiler::optIsNullCheckFoldingLegal(GenTree* tree,
650650
// until we get to the indirection or process the statement root.
651651
GenTree* previousTree = nullCheckTree;
652652
GenTree* currentTree = nullCheckTree->gtNext;
653-
assert(fgStmtListThreaded);
653+
assert(fgNodeThreading == NodeThreading::AllTrees);
654654
while (canRemoveNullCheck && (currentTree != tree) && (currentTree != nullptr))
655655
{
656656
if ((*nullCheckParent == nullptr) && currentTree->TryGetUse(nullCheckTree))

src/coreclr/jit/fgbasic.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,12 @@ void Compiler::fgInit()
8787
#endif // DEBUG
8888

8989
fgLocalVarLivenessDone = false;
90+
fgIsDoingEarlyLiveness = false;
91+
fgDidEarlyLiveness = false;
9092

9193
/* Statement list is not threaded yet */
9294

93-
fgStmtListThreaded = false;
95+
fgNodeThreading = NodeThreading::None;
9496

9597
// Initialize the logic for adding code. This is used to insert code such
9698
// as the code that raises an exception when an array range check fails.

0 commit comments

Comments
 (0)