From 209898103eb2ddb787e79c0cee3727e53f7f3bc4 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 14 Oct 2024 15:37:27 -0700 Subject: [PATCH] JIT: very simple cloning heuristic (#108771) Avoid cloning large loops. We compute loop size by counting tree nodes of all statements of all blocks in the loop. If this is over a threshold, we inhibit cloning. Threshold value was chosen based on distribution of unrestricted cloned loop sizes in the benchmark run_pgo collection. --- src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/jitconfigvalues.h | 6 +- src/coreclr/jit/loopcloning.cpp | 96 ++++++++++++++++++++++++++++++- src/coreclr/jit/loopcloning.h | 15 ++++- 4 files changed, 112 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 5d5fe6ef84b52..cd74ea1c96edd 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7091,6 +7091,7 @@ class Compiler bool optCanonicalizeExit(FlowGraphNaturalLoop* loop, BasicBlock* exit); PhaseStatus optCloneLoops(); + bool optShouldCloneLoop(FlowGraphNaturalLoop* loop, LoopCloneContext* context); void optCloneLoop(FlowGraphNaturalLoop* loop, LoopCloneContext* context); PhaseStatus optUnrollLoops(); // Unrolls loops (needs to have cost info) bool optTryUnrollLoop(FlowGraphNaturalLoop* loop, bool* changedIR); diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 94d4e59569069..23158d49342de 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -59,8 +59,10 @@ CONFIG_INTEGER(JitBreakMorphTree, W("JitBreakMorphTree"), 0xffffffff) CONFIG_INTEGER(JitBreakOnBadCode, W("JitBreakOnBadCode"), 0) CONFIG_INTEGER(JitBreakOnMinOpts, W("JITBreakOnMinOpts"), 0) // Halt if jit switches to MinOpts CONFIG_INTEGER(JitCloneLoops, W("JitCloneLoops"), 1) // If 0, don't clone. Otherwise clone loops for optimizations. -CONFIG_INTEGER(JitCloneLoopsWithGdvTests, W("JitCloneLoopsWithGdvTests"), 1) // If 0, don't clone loops based on - // invariant type/method address tests +CONFIG_INTEGER(JitCloneLoopsWithGdvTests, W("JitCloneLoopsWithGdvTests"), 1) // If 0, don't clone loops based on + // invariant type/method address tests +RELEASE_CONFIG_INTEGER(JitCloneLoopsSizeLimit, W("JitCloneLoopsSizeLimit"), 400) // limit cloning to loops with less + // than this many tree nodes CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In debug builds log places where loop cloning // optimizations are performed on the fast path. CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index 0a74288cb0b7e..ee2f6077a3c50 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -1773,6 +1773,94 @@ void Compiler::optPerformStaticOptimizations(FlowGraphNaturalLoop* loop, } } +//------------------------------------------------------------------------ +// optShouldCloneLoop: Decide if a loop that can be cloned should be cloned. +// +// Arguments: +// loop - the current loop for which the optimizations are performed. +// context - data structure where all loop cloning info is kept. +// +// Returns: +// true if expected performance gain from cloning is worth the potential +// size increase. +// +// Remarks: +// This is a simple-minded heuristic meant to avoid "runaway" cloning +// where large loops are cloned. +// +// We estimate the size cost of cloning by summing up the number of +// tree nodes in all statements in all blocks in the loop. +// +// This value is compared to a hard-coded threshold, and if bigger, +// then the method returns false. +// +bool Compiler::optShouldCloneLoop(FlowGraphNaturalLoop* loop, LoopCloneContext* context) +{ + // Compute loop size + // + unsigned loopSize = 0; + + // For now we use a very simplistic model where each tree node + // has the same code size. + // + // CostSz is not available until later. + // + struct TreeCostWalker : GenTreeVisitor + { + enum + { + DoPreOrder = true, + }; + + unsigned m_nodeCount; + + TreeCostWalker(Compiler* comp) + : GenTreeVisitor(comp) + , m_nodeCount(0) + { + } + + fgWalkResult PreOrderVisit(GenTree** use, GenTree* user) + { + m_nodeCount++; + return WALK_CONTINUE; + } + + void Reset() + { + m_nodeCount = 0; + } + unsigned Cost() + { + return m_nodeCount; + } + }; + + TreeCostWalker costWalker(this); + + loop->VisitLoopBlocks([&](BasicBlock* block) { + weight_t normalizedWeight = block->getBBWeight(this); + for (Statement* const stmt : block->Statements()) + { + costWalker.Reset(); + costWalker.WalkTree(stmt->GetRootNodePointer(), nullptr); + loopSize += costWalker.Cost(); + } + return BasicBlockVisit::Continue; + }); + + int const sizeLimit = JitConfig.JitCloneLoopsSizeLimit(); + + if ((sizeLimit >= 0) && (loopSize >= (unsigned)sizeLimit)) + { + JITDUMP("Loop cloning: rejecting loop " FMT_LP " of size %u, size limit %d\n", loop->GetIndex(), loopSize, + sizeLimit); + return false; + } + + return true; +} + //---------------------------------------------------------------------------- // optIsLoopClonable: Determine whether this loop can be cloned. // @@ -2563,7 +2651,7 @@ Compiler::fgWalkResult Compiler::optCanOptimizeByLoopCloning(GenTree* tree, Loop assert(compCurBB->lastStmt() == info->stmt); info->context->EnsureLoopOptInfo(info->loop->GetIndex()) - ->Push(new (this, CMK_LoopOpt) LcTypeTestOptInfo(info->stmt, indir, lclNum, clsHnd)); + ->Push(new (this, CMK_LoopOpt) LcTypeTestOptInfo(compCurBB, info->stmt, indir, lclNum, clsHnd)); } } else if (optIsHandleOrIndirOfHandle(relopOp2, GTF_ICON_FTN_ADDR)) @@ -2644,7 +2732,7 @@ Compiler::fgWalkResult Compiler::optCanOptimizeByLoopCloning(GenTree* tree, Loop assert(iconHandle->IsIconHandle(GTF_ICON_FTN_ADDR)); assert(compCurBB->lastStmt() == info->stmt); LcMethodAddrTestOptInfo* optInfo = new (this, CMK_LoopOpt) - LcMethodAddrTestOptInfo(info->stmt, indir, lclNum, (void*)iconHandle->IconValue(), + LcMethodAddrTestOptInfo(compCurBB, info->stmt, indir, lclNum, (void*)iconHandle->IconValue(), relopOp2 != iconHandle DEBUG_ARG( (CORINFO_METHOD_HANDLE)iconHandle->gtTargetHandle)); info->context->EnsureLoopOptInfo(info->loop->GetIndex())->Push(optInfo); @@ -2944,6 +3032,10 @@ PhaseStatus Compiler::optCloneLoops() // No need to clone. context.CancelLoopOptInfo(loop->GetIndex()); } + else if (!optShouldCloneLoop(loop, &context)) + { + context.CancelLoopOptInfo(loop->GetIndex()); + } } } diff --git a/src/coreclr/jit/loopcloning.h b/src/coreclr/jit/loopcloning.h index 20f041eab40a5..cfda1be87a8b9 100644 --- a/src/coreclr/jit/loopcloning.h +++ b/src/coreclr/jit/loopcloning.h @@ -321,6 +321,8 @@ struct LcJaggedArrayOptInfo : public LcOptInfo // struct LcTypeTestOptInfo : public LcOptInfo { + // block where statement occurs + BasicBlock* block; // statement where the opportunity occurs Statement* stmt; // indir for the method table @@ -330,8 +332,13 @@ struct LcTypeTestOptInfo : public LcOptInfo // handle being tested for CORINFO_CLASS_HANDLE clsHnd; - LcTypeTestOptInfo(Statement* stmt, GenTreeIndir* methodTableIndir, unsigned lclNum, CORINFO_CLASS_HANDLE clsHnd) + LcTypeTestOptInfo(BasicBlock* block, + Statement* stmt, + GenTreeIndir* methodTableIndir, + unsigned lclNum, + CORINFO_CLASS_HANDLE clsHnd) : LcOptInfo(LcTypeTest) + , block(block) , stmt(stmt) , methodTableIndir(methodTableIndir) , lclNum(lclNum) @@ -342,6 +349,8 @@ struct LcTypeTestOptInfo : public LcOptInfo struct LcMethodAddrTestOptInfo : public LcOptInfo { + // block where statement occurs + BasicBlock* block; // statement where the opportunity occurs Statement* stmt; // indir on the delegate @@ -355,12 +364,14 @@ struct LcMethodAddrTestOptInfo : public LcOptInfo CORINFO_METHOD_HANDLE targetMethHnd; #endif - LcMethodAddrTestOptInfo(Statement* stmt, + LcMethodAddrTestOptInfo(BasicBlock* block, + Statement* stmt, GenTreeIndir* delegateAddressIndir, unsigned delegateLclNum, void* methAddr, bool isSlot DEBUG_ARG(CORINFO_METHOD_HANDLE targetMethHnd)) : LcOptInfo(LcMethodAddrTest) + , block(block) , stmt(stmt) , delegateAddressIndir(delegateAddressIndir) , delegateLclNum(delegateLclNum)