Skip to content

JIT: Check for potential store-to-load forwarding before reordering ldr -> ldp #105695

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -6238,6 +6238,7 @@ class Compiler
PhaseStatus fgUpdateFlowGraphPhase();

PhaseStatus fgDfsBlocksAndRemove();
bool fgRemoveBlocksOutsideDfsTree();

PhaseStatus fgFindOperOrder();

Expand Down
98 changes: 54 additions & 44 deletions src/coreclr/jit/fgopt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5742,72 +5742,82 @@ PhaseStatus Compiler::fgDfsBlocksAndRemove()
fgInvalidateDfsTree();
m_dfsTree = fgComputeDfs();

PhaseStatus status = PhaseStatus::MODIFIED_NOTHING;
if (m_dfsTree->GetPostOrderCount() != fgBBcount)
return fgRemoveBlocksOutsideDfsTree() ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING;
}

//-------------------------------------------------------------
// fgRemoveBlocksOutsideDfsTree: Remove the blocks that are not in the current DFS tree.
//
// Returns:
// True if any block was removed.
//
bool Compiler::fgRemoveBlocksOutsideDfsTree()
{
if (m_dfsTree->GetPostOrderCount() == fgBBcount)
{
return false;
}

#ifdef DEBUG
if (verbose)
if (verbose)
{
printf("%u/%u blocks are unreachable and will be removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount(),
fgBBcount);
for (BasicBlock* block : Blocks())
{
printf("%u/%u blocks are unreachable and will be removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount(),
fgBBcount);
for (BasicBlock* block : Blocks())
if (!m_dfsTree->Contains(block))
{
if (!m_dfsTree->Contains(block))
{
printf(" " FMT_BB "\n", block->bbNum);
}
printf(" " FMT_BB "\n", block->bbNum);
}
}
}
#endif // DEBUG

// The DFS we run is not precise around call-finally, so
// `fgRemoveUnreachableBlocks` can expose newly unreachable blocks
// that we did not uncover during the DFS. If we did remove any
// call-finally blocks then iterate to closure. This is a very rare
// case.
while (true)
{
bool anyCallFinallyPairs = false;
fgRemoveUnreachableBlocks([=, &anyCallFinallyPairs](BasicBlock* block) {
if (!m_dfsTree->Contains(block))
{
anyCallFinallyPairs |= block->isBBCallFinallyPair();
return true;
}

return false;
});

if (!anyCallFinallyPairs)
// The DFS we run is not precise around call-finally, so
// `fgRemoveUnreachableBlocks` can expose newly unreachable blocks
// that we did not uncover during the DFS. If we did remove any
// call-finally blocks then iterate to closure. This is a very rare
// case.
while (true)
{
bool anyCallFinallyPairs = false;
fgRemoveUnreachableBlocks([=, &anyCallFinallyPairs](BasicBlock* block) {
if (!m_dfsTree->Contains(block))
{
break;
anyCallFinallyPairs |= block->isBBCallFinallyPair();
return true;
}

m_dfsTree = fgComputeDfs();
return false;
});

if (!anyCallFinallyPairs)
{
break;
}

m_dfsTree = fgComputeDfs();
}

#ifdef DEBUG
// Did we actually remove all the blocks we said we were going to?
if (verbose)
// Did we actually remove all the blocks we said we were going to?
if (verbose)
{
if (m_dfsTree->GetPostOrderCount() != fgBBcount)
{
if (m_dfsTree->GetPostOrderCount() != fgBBcount)
printf("%u unreachable blocks were not removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount());
for (BasicBlock* block : Blocks())
{
printf("%u unreachable blocks were not removed:\n", fgBBcount - m_dfsTree->GetPostOrderCount());
for (BasicBlock* block : Blocks())
if (!m_dfsTree->Contains(block))
{
if (!m_dfsTree->Contains(block))
{
printf(" " FMT_BB "\n", block->bbNum);
}
printf(" " FMT_BB "\n", block->bbNum);
}
}
}
#endif // DEBUG

status = PhaseStatus::MODIFIED_EVERYTHING;
}
#endif // DEBUG

return status;
return true;
}

//-------------------------------------------------------------
Expand Down
204 changes: 201 additions & 3 deletions src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1295,6 +1295,8 @@ GenTree* Lowering::LowerSwitch(GenTree* node)
switchBBRange.Remove(node->AsOp()->gtOp1);
switchBBRange.Remove(node);

comp->fgInvalidateDfsTree();

return next;
}

Expand Down Expand Up @@ -7695,9 +7697,17 @@ PhaseStatus Lowering::DoPhase()
const bool setSlotNumbers = false;
comp->lvaComputeRefCounts(isRecompute, setSlotNumbers);

// Remove dead blocks and compute DFS (we want to remove unreachable blocks
// even in MinOpts).
comp->fgDfsBlocksAndRemove();
if (comp->m_dfsTree == nullptr)
{
// Compute DFS tree. We want to remove dead blocks even in MinOpts, so we
// do this everywhere. The dead blocks are removed below, however, some of
// lowering may use the DFS tree, so we compute that here.
comp->m_dfsTree = comp->fgComputeDfs();
}

// Remove dead blocks. We want to remove unreachable blocks even in
// MinOpts.
comp->fgRemoveBlocksOutsideDfsTree();

if (comp->backendRequiresLocalVarLifetimes())
{
Expand Down Expand Up @@ -9519,6 +9529,17 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
}
}

// For some hardware the ldr -> ldp transformation can result in missed
// store-to-load forwarding opportunities, which can seriously harm
// performance. Here we do a best effort check to see if one of the loads
// we are combining may be loading from a store that reaches the load
// without redefining the address.
if (prevIndir->OperIsLoad() && indir->OperIsLoad() && IsStoreToLoadForwardingCandidateInLoop(prevIndir, indir))
{
JITDUMP("Avoiding making indirs adjacent; this may be the target of a store-to-load forwarding candidate\n");
return false;
}

JITDUMP("Moving nodes that are not part of data flow of [%06u]\n\n", Compiler::dspTreeID(indir));

GenTree* previous = prevIndir;
Expand Down Expand Up @@ -9565,6 +9586,183 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
return true;
}

//------------------------------------------------------------------------
// IsStoreToLoadForwardingCandidateInLoop: Check if one of the specified
// indirections may be the target of store-to-load forwarding from an indirect
// store that reaches one of the loads and that happens within the same loop.
// In those cases the transformation to 'ldp' can break this hardware
// optimization for some hardware.
//
// Arguments:
// prevIndir - First indirection
// indir - Second indirection
//
// Returns:
// True if so.
//
bool Lowering::IsStoreToLoadForwardingCandidateInLoop(GenTreeIndir* prevIndir, GenTreeIndir* indir)
{
if (comp->m_dfsTree == nullptr)
{
comp->m_dfsTree = comp->fgComputeDfs();
}

if (!comp->m_dfsTree->HasCycle())
{
return false;
}

if (comp->m_loops == nullptr)
{
comp->m_loops = FlowGraphNaturalLoops::Find(comp->m_dfsTree);
comp->m_blockToLoop = BlockToNaturalLoopMap::Build(comp->m_loops);
}

FlowGraphNaturalLoop* loop = comp->m_blockToLoop->GetLoop(m_block);
if (loop == nullptr)
{
return false;
}

GenTree* addr1 = prevIndir->Addr();
target_ssize_t offs1;
comp->gtPeelOffsets(&addr1, &offs1);
unsigned lcl1 = addr1->OperIs(GT_LCL_VAR) ? addr1->AsLclVarCommon()->GetLclNum() : BAD_VAR_NUM;

GenTree* addr2 = indir->Addr();
target_ssize_t offs2;
comp->gtPeelOffsets(&addr2, &offs2);
unsigned lcl2 = addr1->OperIs(GT_LCL_VAR) ? addr2->AsLclVarCommon()->GetLclNum() : BAD_VAR_NUM;

unsigned budget = 100;

// Starting at an end node, go backwards until the specified first node and look for
// 1) Definitions of the base address local, which invalidates future store-to-load forwarding
// 2) Stores to the same address as is being loaded later, which allows store-to-load forwarding
auto checkNodes = [=, &budget](GenTree* lastNode, GenTree* firstNode, bool* hasStore, bool* hasDef) {
*hasStore = false;
*hasDef = false;

for (GenTree* curNode = lastNode;; curNode = curNode->gtPrev)
{
if (curNode->OperIs(GT_STORE_LCL_VAR))
{
unsigned lclNum = curNode->AsLclVarCommon()->GetLclNum();
if ((lclNum == lcl1) || (lclNum == lcl2))
{
*hasDef = true;
return true;
}
}
else if (curNode->OperIs(GT_STOREIND))
{
GenTreeIndir* storeInd = curNode->AsIndir();
GenTree* storeIndirAddr = storeInd->Addr();
target_ssize_t storeIndirOffs;
comp->gtPeelOffsets(&storeIndirAddr, &storeIndirOffs);

if (storeIndirAddr->OperIs(GT_LCL_VAR) && ((storeIndirOffs == offs1) || (storeIndirOffs == offs2)))
{
unsigned storeIndirAddrLcl = storeIndirAddr->AsLclVarCommon()->GetLclNum();
if ((storeIndirAddrLcl == lcl1) || (storeIndirAddrLcl == lcl2))
{
JITDUMP("Store at [%06u] may allow store-to-load forwarding of indir [%06u]\n",
Compiler::dspTreeID(curNode),
Compiler::dspTreeID(storeIndirAddrLcl == lcl1 ? prevIndir : indir));

*hasStore = true;
return true;
}
}
}

if (curNode == firstNode)
{
break;
}

if (--budget == 0)
{
return false;
}
}

return true;
};

bool hasStore;
bool hasDef;
if (!checkNodes(prevIndir, LIR::AsRange(m_block).FirstNode(), &hasStore, &hasDef))
{
// Out of budget
return false;
}

if (hasStore)
{
// Have a store before the indir; it could be store-to-load forwarded.
return true;
}

if (hasDef)
{
// Have a def before the indir; it would break store-to-load
// forwarding. No preds to push then, so we are done.
return false;
}

// Now we've checked range before the indirs; continue with its preds
// inside the loop. We will check the range after the indirs once we get to
// it.
BitVecTraits traits = comp->m_dfsTree->PostOrderTraits();
BitVec visited(BitVecOps::MakeEmpty(&traits));

ArrayStack<BasicBlock*> stack(comp->getAllocator(CMK_ArrayStack));

auto pushPreds = [=, &traits, &visited, &stack](BasicBlock* block) {
for (BasicBlock* pred : block->PredBlocks())
{
if (loop->ContainsBlock(pred) && BitVecOps::TryAddElemD(&traits, visited, pred->bbPostorderNum))
{
stack.Push(pred);
}
}
};

pushPreds(m_block);
Copy link
Member

@AndyAyersMS AndyAyersMS Jul 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lower runs in linear block order, is there any concern that we might not have yet lowered some of these preds?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I presume it's not important since it's just a heuritics and STORIND for primitives look generally the same before & after lower (except the addressing modes)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I don't think there is a concern there. gtPeelOffsets that this is using works both on pre and post lowered forms of address modes.


while (stack.Height() > 0)
{
BasicBlock* block = stack.Pop();

LIR::Range& range = LIR::AsRange(block);

GenTree* firstNode = block == m_block ? prevIndir : range.FirstNode();

if ((firstNode != nullptr) && !checkNodes(range.LastNode(), firstNode, &hasStore, &hasDef))
{
// Out of budget
return false;
}

if (hasStore)
{
// This would be store-to-load forwardable.
return true;
}

if (hasDef)
{
// Redefinition of base local; skip pushing preds
continue;
}

pushPreds(block);
}

return false;
}

//------------------------------------------------------------------------
// MarkTree: Mark trees involved in the computation of 'node' recursively.
//
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ class Lowering final : public Phase
GenTree* LowerIndir(GenTreeIndir* ind);
bool OptimizeForLdpStp(GenTreeIndir* ind);
bool TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir);
bool IsStoreToLoadForwardingCandidateInLoop(GenTreeIndir* prevIndir, GenTreeIndir* indir);
bool TryMoveAddSubRMWAfterIndir(GenTreeLclVarCommon* store);
bool TryMakeIndirAndStoreAdjacent(GenTreeIndir* prevIndir, GenTreeLclVarCommon* store);
void MarkTree(GenTree* root);
Expand Down
Loading