@@ -6193,7 +6193,9 @@ namespace IGC
6193
6193
6194
6194
uint32_t barrierCount = jitInfo->numBarriers ;
6195
6195
uint32_t privateMemPerThread = (uint32_t )(it.second .argumentStackSize + it.second .allocaStackSize );
6196
- uint32_t spillMemPerThread = getSpillMemSizeWithFG (*F, jitInfo->stats .spillMemUsed , pFga);
6196
+ uint32_t spillMemPerThread =
6197
+ getSpillMemSizeWithFG (*F, jitInfo->stats .spillMemUsed , pFga,
6198
+ jitInfo->numBytesScratchGtpin );
6197
6199
uint8_t hasRTCalls = (uint8_t )modMD->FuncMD [F].hasSyncRTCalls ;
6198
6200
6199
6201
attrs.emplace_back (
@@ -6727,7 +6729,8 @@ namespace IGC
6727
6729
IGC_IS_FLAG_SET (ForceScratchSpaceSize)
6728
6730
? IGC_GET_FLAG_VALUE (ForceScratchSpaceSize)
6729
6731
: getSpillMemSizeWithFG (*m_program->entry ,
6730
- jitInfo->stats .spillMemUsed , pFGA);
6732
+ jitInfo->stats .spillMemUsed , pFGA,
6733
+ jitInfo->numBytesScratchGtpin );
6731
6734
6732
6735
pMainKernel->GetGTPinBuffer (pOutput->m_gtpinBuffer ,
6733
6736
pOutput->m_gtpinBufferSize ,
@@ -6756,23 +6759,35 @@ namespace IGC
6756
6759
}
6757
6760
6758
6761
uint32_t CEncoder::getSpillMemSizeWithFG (const llvm::Function &curFunc,
6759
- uint32_t curSize, GenXFunctionGroupAnalysis *fga)
6760
- {
6762
+ uint32_t curSize,
6763
+ GenXFunctionGroupAnalysis *fga,
6764
+ uint32_t gtpinScratchUse) {
6761
6765
if (!fga)
6762
6766
return curSize;
6763
6767
6764
6768
// Return the precise stack size for non-group-head function, and the
6765
6769
// estimated conservative value for group head.
6766
6770
const FunctionGroup *fg = fga->getGroupForHead (&curFunc);
6767
- if (!fg)
6771
+ if (!fg)
6768
6772
return curSize;
6769
- // Since it is difficult to predict amount of space needed to store stack,
6770
- // we reserve a magic large size. Reserving max PTSS is ideal, but it can
6771
- // lead to OOM on machines with large number of threads.
6773
+ // Since it is difficult to predict amount of space needed to store
6774
+ // stack, we reserve a magic large size. Reserving max PTSS is
6775
+ // ideal, but it can lead to OOM on machines with large number of
6776
+ // threads.
6772
6777
auto visaplt = GetVISAPlatform (&(m_program->GetContext ()->platform ));
6773
6778
if (fg->hasIndirectCall () || fg->hasRecursion ()) {
6774
- if (visaplt == TARGET_PLATFORM::Xe_PVCXT)
6779
+ if (visaplt == TARGET_PLATFORM::Xe_PVCXT)
6775
6780
return 64 * 1024 ;
6781
+ // GTPin may need 8KB of scratch for instrumentation. The HWord
6782
+ // scratch message can address only 128KB, whereas the OWord
6783
+ // message used for addressing beyond 128KB requires a free GRF
6784
+ // for its header. If no GRF is free, instrumentation may fail.
6785
+ // So for TGLLP we assume 120KB is used for spills, leaving 8KB
6786
+ // that GTPin can still use when needed. LSC has no 128KB
6787
+ // restriction, so this workaround applies only to TGLLP and
6788
+ // only when GTPin wants to attach.
6789
+ if (visaplt == TARGET_PLATFORM::GENX_TGLLP && gtpinScratchUse)
6790
+ return 120 * 1024 ;
6776
6791
return 128 * 1024 ;
6777
6792
}
6778
6793
0 commit comments