Skip to content

Commit b4ba5da

Browse files
authored
A few optimizations for the gcinfodecoder construction (#96150)
* faster ReadOneFast * lzcnt * linear search for safepoints * do not "decode" individual bool flags * factored out predecoding of fat header * tweaks and comments * fix arm64 build * use intrinsics correctly * no need to search for safepoints when asked only for interruptibility * fix for arm * small tweak for DecodeVarLengthUnsigned * removed SAFE_SHIFT_LEFT, SAFE_SHIFT_RIGHT * undo unnecessary change.
1 parent 1fa3f1d commit b4ba5da

File tree

4 files changed

+258
-193
lines changed

4 files changed

+258
-193
lines changed

src/coreclr/inc/gcinfodecoder.h

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ class BitStreamReader
265265

266266
m_pCurrent = m_pBuffer = dac_cast<PTR_size_t>((size_t)dac_cast<TADDR>(pBuffer) & ~((size_t)sizeof(size_t)-1));
267267
m_RelPos = m_InitialRelPos = (int)((size_t)dac_cast<TADDR>(pBuffer) % sizeof(size_t)) * 8/*BITS_PER_BYTE*/;
268+
m_current = *m_pCurrent >> m_RelPos;
268269
}
269270

270271
BitStreamReader(const BitStreamReader& other)
@@ -275,6 +276,7 @@ class BitStreamReader
275276
m_InitialRelPos = other.m_InitialRelPos;
276277
m_pCurrent = other.m_pCurrent;
277278
m_RelPos = other.m_RelPos;
279+
m_current = other.m_current;
278280
}
279281

280282
const BitStreamReader& operator=(const BitStreamReader& other)
@@ -285,6 +287,7 @@ class BitStreamReader
285287
m_InitialRelPos = other.m_InitialRelPos;
286288
m_pCurrent = other.m_pCurrent;
287289
m_RelPos = other.m_RelPos;
290+
m_current = other.m_current;
288291
return *this;
289292
}
290293

@@ -295,33 +298,35 @@ class BitStreamReader
295298

296299
_ASSERTE(numBits > 0 && numBits <= BITS_PER_SIZE_T);
297300

298-
size_t result = (*m_pCurrent) >> m_RelPos;
301+
size_t result = m_current;
302+
m_current >>= numBits;
299303
int newRelPos = m_RelPos + numBits;
300304
if(newRelPos >= BITS_PER_SIZE_T)
301305
{
302306
m_pCurrent++;
307+
m_current = *m_pCurrent;
303308
newRelPos -= BITS_PER_SIZE_T;
304-
if(newRelPos > 0)
305-
{
306-
size_t extraBits = (*m_pCurrent) << (numBits - newRelPos);
307-
result ^= extraBits;
308-
}
309+
size_t extraBits = m_current << (numBits - newRelPos);
310+
result |= extraBits;
311+
m_current >>= newRelPos;
309312
}
310313
m_RelPos = newRelPos;
311-
result &= SAFE_SHIFT_LEFT(1, numBits) - 1;
314+
result &= ((size_t)-1 >> (BITS_PER_SIZE_T - numBits));
312315
return result;
313316
}
314317

315-
// This version reads one bit, returning zero/non-zero (not 0/1)
318+
// This version reads one bit
316319
// NOTE: This routine is perf-critical
317320
__forceinline size_t ReadOneFast()
318321
{
319322
SUPPORTS_DAC;
320323

321-
size_t result = (*m_pCurrent) & (((size_t)1) << m_RelPos);
324+
size_t result = m_current & 1;
325+
m_current >>= 1;
322326
if(++m_RelPos == BITS_PER_SIZE_T)
323327
{
324328
m_pCurrent++;
329+
m_current = *m_pCurrent;
325330
m_RelPos = 0;
326331
}
327332
return result;
@@ -339,6 +344,7 @@ class BitStreamReader
339344
size_t adjPos = pos + m_InitialRelPos;
340345
m_pCurrent = m_pBuffer + adjPos / BITS_PER_SIZE_T;
341346
m_RelPos = (int)(adjPos % BITS_PER_SIZE_T);
347+
m_current = *m_pCurrent >> m_RelPos;
342348
_ASSERTE(GetCurrentPos() == pos);
343349
}
344350

@@ -349,19 +355,6 @@ class BitStreamReader
349355
SetCurrentPos(GetCurrentPos() + numBitsToSkip);
350356
}
351357

352-
__forceinline void AlignUpToByte()
353-
{
354-
if(m_RelPos <= BITS_PER_SIZE_T - 8)
355-
{
356-
m_RelPos = (m_RelPos + 7) & ~7;
357-
}
358-
else
359-
{
360-
m_RelPos = 0;
361-
m_pCurrent++;
362-
}
363-
}
364-
365358
__forceinline size_t ReadBitAtPos( size_t pos )
366359
{
367360
size_t adjPos = pos + m_InitialRelPos;
@@ -376,17 +369,17 @@ class BitStreamReader
376369
// See the corresponding methods on BitStreamWriter for more information on the format
377370
//--------------------------------------------------------------------------
378371

379-
inline size_t DecodeVarLengthUnsigned( int base )
372+
size_t DecodeVarLengthUnsignedMore ( int base )
380373
{
381374
_ASSERTE((base > 0) && (base < (int)BITS_PER_SIZE_T));
382375
size_t numEncodings = size_t{ 1 } << base;
383-
size_t result = 0;
384-
for(int shift=0; ; shift+=base)
376+
size_t result = numEncodings;
377+
for(int shift=base; ; shift+=base)
385378
{
386379
_ASSERTE(shift+base <= (int)BITS_PER_SIZE_T);
387380

388381
size_t currentChunk = Read(base+1);
389-
result |= (currentChunk & (numEncodings-1)) << shift;
382+
result ^= (currentChunk & (numEncodings-1)) << shift;
390383
if(!(currentChunk & numEncodings))
391384
{
392385
// Extension bit is not set, we're done.
@@ -395,6 +388,19 @@ class BitStreamReader
395388
}
396389
}
397390

391+
__forceinline size_t DecodeVarLengthUnsigned(int base)
392+
{
393+
_ASSERTE((base > 0) && (base < (int)BITS_PER_SIZE_T));
394+
395+
size_t result = Read(base + 1);
396+
if (result & ((size_t)1 << base))
397+
{
398+
result ^= DecodeVarLengthUnsignedMore(base);
399+
}
400+
401+
return result;
402+
}
403+
398404
inline SSIZE_T DecodeVarLengthSigned( int base )
399405
{
400406
_ASSERTE((base > 0) && (base < (int)BITS_PER_SIZE_T));
@@ -422,6 +428,7 @@ class BitStreamReader
422428
int m_InitialRelPos;
423429
PTR_size_t m_pCurrent;
424430
int m_RelPos;
431+
size_t m_current;
425432
};
426433

427434
struct GcSlotDesc
@@ -565,15 +572,8 @@ class GcInfoDecoder
565572
UINT32 m_InstructionOffset;
566573

567574
// Pre-decoded information
575+
GcInfoHeaderFlags m_headerFlags;
568576
bool m_IsInterruptible;
569-
bool m_IsVarArg;
570-
bool m_GenericSecretParamIsMD;
571-
bool m_GenericSecretParamIsMT;
572-
#ifdef TARGET_AMD64
573-
bool m_WantsReportOnlyLeaf;
574-
#elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
575-
bool m_HasTailCalls;
576-
#endif // TARGET_AMD64
577577
INT32 m_GSCookieStackSlot;
578578
INT32 m_ReversePInvokeFrameStackSlot;
579579
UINT32 m_ValidRangeStart;
@@ -590,7 +590,8 @@ class GcInfoDecoder
590590
#ifdef PARTIALLY_INTERRUPTIBLE_GC_SUPPORTED
591591
UINT32 m_NumSafePoints;
592592
UINT32 m_SafePointIndex;
593-
UINT32 FindSafePoint(UINT32 codeOffset);
593+
UINT32 NarrowSafePointSearch(size_t savedPos, UINT32 breakOffset, UINT32* searchEnd);
594+
UINT32 FindSafePoint(UINT32 codeOffset);
594595
#endif
595596
UINT32 m_NumInterruptibleRanges;
596597

@@ -604,6 +605,8 @@ class GcInfoDecoder
604605
#endif
605606
UINT32 m_Version;
606607

608+
bool PredecodeFatHeader(int remainingFlags);
609+
607610
static bool SetIsInterruptibleCB (UINT32 startOffset, UINT32 stopOffset, void * hCallback);
608611

609612
OBJECTREF* GetRegisterSlot(

src/coreclr/inc/gcinfoencoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ class BitStreamWriter
285285
// Writes bits knowing that they will all fit in the current memory slot
286286
inline void WriteInCurrentSlot( size_t data, UINT32 count )
287287
{
288-
data &= SAFE_SHIFT_LEFT(1, count) - 1;
288+
data &= ((size_t)-1 >> (BITS_PER_SIZE_T - count));
289289
data <<= (BITS_PER_SIZE_T - m_FreeBitsInCurrentSlot);
290290
*m_pCurrentSlot |= data;
291291
}

src/coreclr/inc/gcinfotypes.h

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
#include "gcinfo.h"
1010
#endif
1111

12+
#ifdef _MSC_VER
13+
#include <intrin.h>
14+
#endif // _MSC_VER
15+
1216
// *****************************************************************************
1317
// WARNING!!!: These values and code are also used by SOS in the diagnostics
1418
// repo. Should updated in a backwards and forwards compatible way.
@@ -22,35 +26,33 @@
2226

2327
#define BITS_PER_SIZE_T ((int)sizeof(size_t)*8)
2428

25-
26-
//--------------------------------------------------------------------------------
27-
// It turns out, that ((size_t)x) << y == x, when y is not a literal
28-
// and its value is BITS_PER_SIZE_T
29-
// I guess the processor only shifts of the right operand modulo BITS_PER_SIZE_T
30-
// In many cases, we want the above operation to yield 0,
31-
// hence the following macros
32-
//--------------------------------------------------------------------------------
33-
__forceinline size_t SAFE_SHIFT_LEFT(size_t x, size_t count)
34-
{
35-
_ASSERTE(count <= BITS_PER_SIZE_T);
36-
return (x << 1) << (count - 1);
37-
}
38-
__forceinline size_t SAFE_SHIFT_RIGHT(size_t x, size_t count)
39-
{
40-
_ASSERTE(count <= BITS_PER_SIZE_T);
41-
return (x >> 1) >> (count - 1);
42-
}
43-
4429
inline UINT32 CeilOfLog2(size_t x)
4530
{
31+
// it is ok to use bsr or clz unconditionally
4632
_ASSERTE(x > 0);
47-
UINT32 result = (x & (x - 1)) ? 1 : 0;
48-
while (x != 1)
49-
{
50-
result++;
51-
x >>= 1;
52-
}
53-
return result;
33+
34+
x = (x << 1) - 1;
35+
36+
#ifdef TARGET_64BIT
37+
#ifdef _MSC_VER
38+
DWORD result;
39+
_BitScanReverse64(&result, (unsigned long)x);
40+
return (UINT32)result;
41+
#else // _MSC_VER
42+
// LZCNT returns index starting from MSB, whereas BSR gives the index from LSB.
43+
// 63 ^ LZCNT here is equivalent to 63 - LZCNT since the LZCNT result is always between 0 and 63.
44+
// This saves an instruction, as subtraction from constant requires either MOV/SUB or NEG/ADD.
45+
return (UINT32)63 ^ (UINT32)__builtin_clzl((unsigned long)x);
46+
#endif // _MSC_VER
47+
#else // TARGET_64BIT
48+
#ifdef _MSC_VER
49+
DWORD result;
50+
_BitScanReverse(&result, (unsigned int)x);
51+
return (UINT32)result;
52+
#else // _MSC_VER
53+
return (UINT32)31 ^ (UINT32)__builtin_clz((unsigned int)x);
54+
#endif // _MSC_VER
55+
#endif
5456
}
5557

5658
enum GcSlotFlags

0 commit comments

Comments
 (0)