Skip to content

Commit 51bac63

Browse files
GrabYourPitchforksmarek-safar
authored andcommitted
PR feedback: Clarify 3-byte sequence processing
Signed-off-by: dotnet-bot <dotnet-bot@microsoft.com>
1 parent 5246c85 commit 51bac63

File tree

2 files changed

+21
-6
lines changed

2 files changed

+21
-6
lines changed

netcore/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ internal static partial class Utf8Utility
2020
[MethodImpl(MethodImplOptions.AggressiveInlining)]
2121
private static uint ExtractCharFromFirstThreeByteSequence(uint value)
2222
{
23+
Debug.Assert(UInt32BeginsWithUtf8ThreeByteMask(value));
24+
2325
if (BitConverter.IsLittleEndian)
2426
{
2527
// value = [ ######## | 10xxxxxx 10yyyyyy 1110zzzz ]

netcore/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -493,18 +493,31 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
493493
if (Bmi2.X64.IsSupported)
494494
{
495495
Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
496+
497+
// First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which
498+
// would indicate the potential start of a second three-byte sequence.
499+
496500
if (((thisDWord - 0xE000_0000u) & 0xF000_0000u) == 0)
497501
{
498-
if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 7)
502+
// The const '3' below is correct because pFinalPosWhereCanReadDWordFromInputBuffer represents
503+
// the final place where we can safely perform a DWORD read, and we want to probe whether it's
504+
// safe to read a DWORD beginning at address &pInputBuffer[3].
505+
506+
if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3)
499507
{
500508
// We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
501-
502-
uint nextDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 3);
503-
if (((nextDWord & 0x0000_200Fu) != 0) && (((nextDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
509+
// Since we already validated the first byte of the second DWORD (it's the same as the final byte of the
510+
// first DWORD), the only checks that remain are the overlong + surrogate checks. If the overlong or surrogate
511+
// checks fail, we'll fall through to the remainder of the logic which will transcode the original valid
512+
// 3-byte UTF-8 sequence we read; and on the next iteration of the loop the validation routine will run again,
513+
// fail, and redirect control flow to the error handling logic at the very end of this method.
514+
515+
uint secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 3);
516+
if (((secondDWord & 0x0000_200Fu) != 0) && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
504517
{
505518
// combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
506-
ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(nextDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
507-
thisDWord = nextDWord; // store this value in the correct local for the ASCII drain logic
519+
ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
520+
thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic
508521

509522
// extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
510523
ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul);

0 commit comments

Comments
 (0)