
Commit 3661584 (1 parent: 54af92b)

Workaround memset alignment sensitivity (#24302)

memset is up to 2x slower on misaligned blocks on some types of hardware. The problem is the uneven performance of "rep stosb", which is used to implement memset in some cases. The exact matrix of when it is slower and by how much is very complex. This change works around the issue by aligning the memory block before it is passed to memset and filling in the potentially misaligned edges manually. The workaround regresses performance by a few percent (<10%) in some cases, but gains up to a 2x improvement in others.

Fixes #24300
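To make the workaround concrete, here is a worked example with invented addresses, using the 32-byte alignment and the four 8-byte stores per end that the native change below adopts. Suppose dst = 0x1005 and length = 0x200, so the block ends at end = 0x1205. The stores at the head zero [0x1005, 0x1025), and the stores at the tail zero [0x11E5, 0x1205). The range handed to memset is then trimmed to the aligned interior: ALIGN_UP(0x1006, 32) = 0x1020 and ALIGN_DOWN(0x1204, 32) = 0x1200, so memset clears [0x1020, 0x1200), a 32-byte-aligned block whose length (0x1E0) is a multiple of 32. Each trimmed edge is at most 32 bytes, exactly the ranges the manual stores already zeroed, so together the three ranges cover every byte of [0x1005, 0x1205).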

File tree

2 files changed (+25, −1 lines)

src/System.Private.CoreLib/shared/System/SpanHelpers.cs (3 additions, 1 deletion)

@@ -24,7 +24,9 @@ public static unsafe void ClearWithoutReferences(ref byte b, nuint byteLength)
                 return;
 
 #if CORECLR && (AMD64 || ARM64)
-            if (byteLength > 4096)
+            // The exact matrix of when RhZeroMemory is faster than InitBlockUnaligned is very complex. The factors to consider include
+            // the type of hardware and memory alignment. This threshold was chosen as a good balance across different configurations.
+            if (byteLength > 768)
                 goto PInvoke;
             Unsafe.InitBlockUnaligned(ref b, 0, (uint)byteLength);
             return;
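For context: after this change, the managed fast path clears blocks of up to 768 bytes inline via Unsafe.InitBlockUnaligned, while larger blocks take the PInvoke path to the native zeroing routine, which is where the alignment workaround in the next file applies.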

src/vm/comutilnative.cpp (22 additions, 0 deletions)

@@ -815,6 +815,28 @@ void QCALLTYPE MemoryNative::Clear(void *dst, size_t length)
 {
     QCALL_CONTRACT;
 
+#if defined(_X86_) || defined(_AMD64_)
+    if (length > 0x100)
+    {
+        // memset ends up calling rep stosb if the hardware claims to support it efficiently. rep stosb is up to 2x slower
+        // on misaligned blocks. Work around this issue by aligning the blocks passed to memset upfront.
+
+        *(uint64_t*)dst = 0;
+        *((uint64_t*)dst + 1) = 0;
+        *((uint64_t*)dst + 2) = 0;
+        *((uint64_t*)dst + 3) = 0;
+
+        void* end = (uint8_t*)dst + length;
+        *((uint64_t*)end - 1) = 0;
+        *((uint64_t*)end - 2) = 0;
+        *((uint64_t*)end - 3) = 0;
+        *((uint64_t*)end - 4) = 0;
+
+        dst = ALIGN_UP((uint8_t*)dst + 1, 32);
+        length = ALIGN_DOWN((uint8_t*)end - 1, 32) - (uint8_t*)dst;
+    }
+#endif
+
     memset(dst, 0, length);
 }
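To experiment with the native logic outside the VM, here is a minimal self-contained sketch; it is an approximation, not the CLR's code. align_up and align_down are hypothetical stand-ins for the CLR's ALIGN_UP and ALIGN_DOWN macros, and the unaligned uint64_t stores mirror the commit, relying on x86's tolerance of unaligned accesses rather than on strict standard C++.

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical stand-ins for the CLR's ALIGN_UP / ALIGN_DOWN macros:
// round a pointer up or down to a multiple of `alignment` (a power of two).
static uint8_t* align_up(uint8_t* p, uintptr_t alignment)
{
    return (uint8_t*)(((uintptr_t)p + alignment - 1) & ~(alignment - 1));
}

static uint8_t* align_down(uint8_t* p, uintptr_t alignment)
{
    return (uint8_t*)((uintptr_t)p & ~(alignment - 1));
}

// Mirrors the shape of MemoryNative::Clear after this commit.
static void ClearSketch(void* dst, size_t length)
{
    if (length > 0x100)
    {
        // Zero 32 bytes at the head and 32 bytes at the tail with plain
        // stores; any byte trimmed off below falls inside one of these ranges.
        *(uint64_t*)dst = 0;
        *((uint64_t*)dst + 1) = 0;
        *((uint64_t*)dst + 2) = 0;
        *((uint64_t*)dst + 3) = 0;

        void* end = (uint8_t*)dst + length;
        *((uint64_t*)end - 1) = 0;
        *((uint64_t*)end - 2) = 0;
        *((uint64_t*)end - 3) = 0;
        *((uint64_t*)end - 4) = 0;

        // Hand memset a 32-byte-aligned pointer and a 32-byte-multiple length.
        uint8_t* alignedStart = align_up((uint8_t*)dst + 1, 32);
        uint8_t* alignedEnd = align_down((uint8_t*)end - 1, 32);
        dst = alignedStart;
        length = alignedEnd - alignedStart;
    }
    memset(dst, 0, length);
}

int main()
{
    // Clear a deliberately misaligned 512-byte range and verify that every
    // byte inside it is zeroed while its neighbors are untouched.
    alignas(64) uint8_t buffer[1024];
    memset(buffer, 0xFF, sizeof(buffer));
    ClearSketch(buffer + 5, 512);
    for (size_t i = 0; i < 512; i++)
        assert(buffer[5 + i] == 0);
    assert(buffer[4] == 0xFF && buffer[5 + 512] == 0xFF);
    return 0;
}

Note the +1/−1 before aligning: when dst or end is already 32-byte aligned, this trims a full 32 bytes (already zeroed by the manual stores) instead of zero, keeping the edge handling uniform. Because the path is only taken for length > 0x100, the trimmed interior length (at least length − 64) is always positive.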
