A few optimizations for the gcinfodecoder construction #96150
Changes from 7 commits
@@ -265,6 +265,7 @@ class BitStreamReader
         m_pCurrent = m_pBuffer = dac_cast<PTR_size_t>((size_t)dac_cast<TADDR>(pBuffer) & ~((size_t)sizeof(size_t)-1));
         m_RelPos = m_InitialRelPos = (int)((size_t)dac_cast<TADDR>(pBuffer) % sizeof(size_t)) * 8/*BITS_PER_BYTE*/;
+        m_current = *m_pCurrent >> m_RelPos;
     }

     BitStreamReader(const BitStreamReader& other)
@@ -275,6 +276,7 @@ class BitStreamReader
         m_InitialRelPos = other.m_InitialRelPos;
         m_pCurrent = other.m_pCurrent;
         m_RelPos = other.m_RelPos;
+        m_current = other.m_current;

On 64-bit, one native word can act as a "buffer" for quite a few reads when each read takes only a few bits. This change reduces the need for indirect reads from the bitstream and may allow the compiler to enregister the "buffer".
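
To make the caching concrete, here is a minimal standalone sketch of the pattern (an editor's illustration, not code from the PR; the names loosely mirror BitStreamReader):

    #include <cstdint>

    // Sketch: a bit reader that keeps the unread bits of the current word in
    // a member so that most reads are register-only operations.
    struct TinyBitReader
    {
        const uint64_t* pCurrent; // word under the cursor
        uint64_t current;         // unread bits of *pCurrent, lowest bit next
        int relPos;               // bits already consumed from *pCurrent

        uint64_t ReadOne()
        {
            uint64_t result = current & 1;
            current >>= 1;            // fixed-size shift, no memory access
            if (++relPos == 64)       // refill only once per 64 reads
            {
                pCurrent++;
                current = *pCurrent;  // the only indirect read
                relPos = 0;
            }
            return result;
        }
    };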
     }

     const BitStreamReader& operator=(const BitStreamReader& other)

@@ -285,6 +287,7 @@ class BitStreamReader
         m_InitialRelPos = other.m_InitialRelPos;
         m_pCurrent = other.m_pCurrent;
         m_RelPos = other.m_RelPos;
+        m_current = other.m_current;
         return *this;
     }
@@ -295,33 +298,35 @@ class BitStreamReader
         _ASSERTE(numBits > 0 && numBits <= BITS_PER_SIZE_T);

-        size_t result = (*m_pCurrent) >> m_RelPos;
+        size_t result = m_current;
+        m_current >>= numBits;
         int newRelPos = m_RelPos + numBits;
         if(newRelPos >= BITS_PER_SIZE_T)
         {
             m_pCurrent++;
+            m_current = *m_pCurrent;
             newRelPos -= BITS_PER_SIZE_T;
-            if(newRelPos > 0)
-            {
-                size_t extraBits = (*m_pCurrent) << (numBits - newRelPos);
-                result ^= extraBits;
-            }
+            size_t extraBits = m_current << (numBits - newRelPos);
+            result |= extraBits;
+            m_current >>= newRelPos;
         }
         m_RelPos = newRelPos;
-        result &= SAFE_SHIFT_LEFT(1, numBits) - 1;
+        result &= ((size_t)-1 >> (BITS_PER_SIZE_T - numBits));
         return result;
     }
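
A note on the last replaced line above: `SAFE_SHIFT_LEFT(1, numBits) - 1` existed because a plain `1 << 64` is undefined behavior, while the new mask keeps its shift count in [0, 63] for the asserted range 1 <= numBits <= BITS_PER_SIZE_T, so it needs no helper. A small standalone check of the equivalence (editor's sketch; MaskOldStyle emulates what a safe-shift helper has to do, not necessarily the helper's exact implementation):

    #include <cassert>
    #include <cstdint>

    // New form: the shift count is 64 - numBits, always in [0, 63].
    static uint64_t MaskNew(int numBits)
    {
        return (uint64_t)-1 >> (64 - numBits);
    }

    // Old style: build (1 << numBits) - 1 while keeping numBits == 64 defined
    // by splitting the shift into two steps.
    static uint64_t MaskOldStyle(int numBits)
    {
        return (((uint64_t)1 << 1) << (numBits - 1)) - 1;
    }

    int main()
    {
        for (int numBits = 1; numBits <= 64; numBits++)
            assert(MaskNew(numBits) == MaskOldStyle(numBits));
        return 0;
    }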
-    // This version reads one bit, returning zero/non-zero (not 0/1)
+    // This version reads one bit
     // NOTE: This routine is perf-critical
     __forceinline size_t ReadOneFast()
     {
         SUPPORTS_DAC;

-        size_t result = (*m_pCurrent) & (((size_t)1) << m_RelPos);
+        size_t result = m_current & 1;
+        m_current >>= 1;

The point of this change is to use a fixed-size shift, which is typically faster than a variable-sized shift.

Yes, but you pay for it with extra memory writes. Is it really a win in the end? For example, per the instruction timings for Intel 11th gen, a constant shift of memory is twice the cost of a variable shift of a register.

I will have to recheck the original codegen, but I think what was happening is that we would do an indirect read and then apply a mask that was constructed via a variable shift of `1`.

Interestingly, for Zen3 the table gives no difference whatsoever between immediate and CL shift versions, yet the profiler finds CL shift operations relatively expensive. Maybe it is not the instruction itself, but the whole sequence of dependent instructions - load 1, load CL, make a mask, do a read, apply the mask - and the sampling profiler just attributes most of that to the shift. The code does get measurably faster after the change. I had a few other changes that did not improve anything, so I did not include them, but this one definitely helped. C++ inlines and interleaves statement parts quite aggressively, so it is a bit hard to see what belongs where, but I see that a few "expensive" CL shifts are gone.

For example:

    for(UINT32 i = 0; i < numSlots; i++)
    {
        if(m_Reader.ReadOneFast())
            numCouldBeLiveSlots++;
    }

It is not an example of expensive/hot code, just something that is easier to read and see how the codegen changed.

==== original code

    00007FF700CE9C4E test r13d,r13d
    00007FF700CE9C51 je GcInfoDecoder::EnumerateLiveSlots+898h (07FF700CE9C98h)
    00007FF700CE9C53 mov r8d,r13d
    00007FF700CE9C56 nop word ptr [rax+rax]
    00007FF700CE9C60 mov ecx,r15d
    00007FF700CE9C63 mov rax,r11              ; r11 has `1` in it, assigned outside of the loop
    00007FF700CE9C66 shl rax,cl               ; depends on 2 instructions above
    00007FF700CE9C69 and rax,qword ptr [rdx]  ; depends on instruction above + read (L1 is 3-5 cycles)
    00007FF700CE9C6C inc r15d
    00007FF700CE9C6F mov dword ptr [rdi+18h],r15d
    00007FF700CE9C73 cmp r15d,40h
    00007FF700CE9C77 jne GcInfoDecoder::EnumerateLiveSlots+88Bh (07FF700CE9C8Bh)
    00007FF700CE9C79 add rdx,8                ; moving to the next word, relatively rare (1 in 64 reads)
    00007FF700CE9C7D mov dword ptr [rdi+18h],0
    00007FF700CE9C84 mov qword ptr [rdi+10h],rdx
    00007FF700CE9C88 xor r15d,r15d
    00007FF700CE9C8B test rax,rax
    00007FF700CE9C8E je GcInfoDecoder::EnumerateLiveSlots+893h (07FF700CE9C93h)
    00007FF700CE9C90 inc r12d
    00007FF700CE9C93 sub r8,r11

vs.

==== new

    00007FF688D6A454 test r12d,r12d
    00007FF688D6A457 je GcInfoDecoder::EnumerateLiveSlots+0C04h (07FF688D6A4B4h)
    00007FF688D6A459 mov rax,r11
    00007FF688D6A45C mov edx,r12d
    00007FF688D6A45F nop
    00007FF688D6A460 mov rcx,rax
    00007FF688D6A463 inc r8d                      ; can run together or even before the previous instruction
    00007FF688D6A466 shr rax,1                    ; this and the next can run at the same time
    00007FF688D6A469 and ecx,1                    ; both only depend on "mov rcx,rax"
    00007FF688D6A46C mov qword ptr [rbx+20h],rax  ; we do writes, but we do not need to wait for them
    00007FF688D6A470 mov dword ptr [rbx+18h],r8d
    00007FF688D6A474 mov qword ptr [rsp+48h],rax  ; not sure what is stored here, in other cases we have just 2 writes
    00007FF688D6A479 cmp r8d,40h
    00007FF688D6A47D jne GcInfoDecoder::EnumerateLiveSlots+0BEBh (07FF688D6A49Bh)
    00007FF688D6A47F add qword ptr [rbx+10h],8    ; moving to the next word, relatively rare (1 in 64 reads)
    00007FF688D6A484 mov rax,qword ptr [rbx+10h]
    00007FF688D6A488 xor r8d,r8d
    00007FF688D6A48B mov rax,qword ptr [rax]
    00007FF688D6A48E mov qword ptr [rbx+20h],rax
    00007FF688D6A492 mov qword ptr [rsp+48h],rax
    00007FF688D6A497 mov dword ptr [rbx+18h],r8d
    00007FF688D6A49B test rcx,rcx
    00007FF688D6A49E je GcInfoDecoder::EnumerateLiveSlots+0BF3h (07FF688D6A4A3h)
    00007FF688D6A4A0 inc r13d
    00007FF688D6A4A3 sub rdx,1
         if(++m_RelPos == BITS_PER_SIZE_T)
         {
             m_pCurrent++;
+            m_current = *m_pCurrent;
             m_RelPos = 0;
         }
         return result;
@@ -339,6 +344,7 @@ class BitStreamReader
         size_t adjPos = pos + m_InitialRelPos;
         m_pCurrent = m_pBuffer + adjPos / BITS_PER_SIZE_T;
         m_RelPos = (int)(adjPos % BITS_PER_SIZE_T);
+        m_current = *m_pCurrent >> m_RelPos;
         _ASSERTE(GetCurrentPos() == pos);
     }
@@ -349,19 +355,6 @@ class BitStreamReader
         SetCurrentPos(GetCurrentPos() + numBitsToSkip);
     }

-    __forceinline void AlignUpToByte()
-    {
-        if(m_RelPos <= BITS_PER_SIZE_T - 8)
-        {
-            m_RelPos = (m_RelPos + 7) & ~7;
-        }
-        else
-        {
-            m_RelPos = 0;
-            m_pCurrent++;
-        }
-    }
-
     __forceinline size_t ReadBitAtPos( size_t pos )
     {
         size_t adjPos = pos + m_InitialRelPos;
@@ -422,6 +415,7 @@ class BitStreamReader
     int m_InitialRelPos;
     PTR_size_t m_pCurrent;
     int m_RelPos;
+    size_t m_current;
 };

 struct GcSlotDesc
@@ -565,15 +559,8 @@ class GcInfoDecoder
     UINT32 m_InstructionOffset;

     // Pre-decoded information
+    GcInfoHeaderFlags m_headerFlags;
-    bool m_IsInterruptible;
-    bool m_IsVarArg;
-    bool m_GenericSecretParamIsMD;
-    bool m_GenericSecretParamIsMT;

These are derived from masking the header flags. Masking is cheap, and we may not even be asked for these, so we can do the masking in the accessors.
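
A hedged sketch of what such an accessor can look like after this change (GC_INFO_IS_VARARG is an existing GcInfoHeaderFlags value; the PR's actual accessor bodies may differ):

    // Derive the answer from the stored flags on demand instead of
    // pre-decoding it into a dedicated bool field.
    bool GetIsVarArg() const
    {
        return (m_headerFlags & GC_INFO_IS_VARARG) != 0;
    }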
-#ifdef TARGET_AMD64
-    bool m_WantsReportOnlyLeaf;
-#elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
-    bool m_HasTailCalls;
-#endif // TARGET_AMD64
     INT32 m_GSCookieStackSlot;
     INT32 m_ReversePInvokeFrameStackSlot;
     UINT32 m_ValidRangeStart;
@@ -590,7 +577,8 @@ class GcInfoDecoder
 #ifdef PARTIALLY_INTERRUPTIBLE_GC_SUPPORTED
     UINT32 m_NumSafePoints;
     UINT32 m_SafePointIndex;
-    UINT32 FindSafePoint(UINT32 codeOffset);
+    UINT32 NarrowSafePointSearch(size_t savedPos, UINT32 breakOffset, UINT32* searchEnd);
+    UINT32 FindSafePoint(UINT32 codeOffset);
 #endif
     UINT32 m_NumInterruptibleRanges;
@@ -604,6 +592,8 @@ class GcInfoDecoder
 #endif
     UINT32 m_Version;

+    bool PredecodeFatHeader(int remainingFlags);
+
     static bool SetIsInterruptibleCB (UINT32 startOffset, UINT32 stopOffset, void * hCallback);

     OBJECTREF* GetRegisterSlot(
----------------------------------------------------------------
@@ -9,6 +9,10 @@
 #include "gcinfo.h"
 #endif

+#ifdef _MSC_VER
+#include <intrin.h>
+#endif // _MSC_VER
+
 // *****************************************************************************
 // WARNING!!!: These values and code are also used by SOS in the diagnostics
 // repo. Should be updated in a backwards and forwards compatible way.
@@ -43,14 +47,28 @@ __forceinline size_t SAFE_SHIFT_RIGHT(size_t x, size_t count)

 inline UINT32 CeilOfLog2(size_t x)
 {
+    // we want lzcnt, but bsr is ok too
     _ASSERTE(x > 0);
-    UINT32 result = (x & (x - 1)) ? 1 : 0;
-    while (x != 1)
-    {
-        result++;
-        x >>= 1;
-    }
-    return result;
+    x = (x << 1) - 1;
+
+#ifdef TARGET_64BIT
+#ifdef _MSC_VER
+    DWORD bitIndex;
+    _BitScanReverse64(&bitIndex, (unsigned __int64)x);
+    UINT32 lzcountCeil = BITS_PER_SIZE_T - 1 - (UINT32)bitIndex;
+#else // _MSC_VER
+    UINT32 lzcountCeil = (UINT32)__builtin_clzll((unsigned long long)x);
+#endif // _MSC_VER
+#else // TARGET_64BIT
+#ifdef _MSC_VER
+    DWORD bitIndex;
+    _BitScanReverse(&bitIndex, (unsigned long)x);
+    UINT32 lzcountCeil = BITS_PER_SIZE_T - 1 - (UINT32)bitIndex;
+#else // _MSC_VER
+    UINT32 lzcountCeil = (UINT32)__builtin_clz((unsigned int)x);
+#endif // _MSC_VER
+#endif
+
+    return BITS_PER_SIZE_T - 1 - lzcountCeil;
 }

 enum GcSlotFlags
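
As a sanity check on the `(x << 1) - 1` trick above: for any x > 0, the bit index of the highest set bit of 2x - 1 equals Ceil(Log2(x)), so a single bit scan replaces the old shift loop. A standalone comparison against the removed loop (editor's example, using the gcc/clang builtin; MSVC would go through _BitScanReverse64 as in the diff):

    #include <cassert>
    #include <cstdint>

    // The loop removed by the PR.
    static uint32_t CeilOfLog2_Loop(uint64_t x)
    {
        uint32_t result = (x & (x - 1)) ? 1 : 0;
        while (x != 1) { result++; x >>= 1; }
        return result;
    }

    // The single-scan replacement: highest set bit index of (2x - 1).
    static uint32_t CeilOfLog2_Scan(uint64_t x)
    {
        x = (x << 1) - 1;
        return 63 - (uint32_t)__builtin_clzll(x);
    }

    int main()
    {
        for (uint64_t x = 1; x < 1000000; x++)
            assert(CeilOfLog2_Loop(x) == CeilOfLog2_Scan(x));
        return 0;
    }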