Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 34 additions & 39 deletions src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs
Original file line number Diff line number Diff line change
Expand Up @@ -395,46 +395,38 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> source, out Rune
// it tries to consume as many code units as possible as long as those code
// units constitute the beginning of a longer well-formed subsequence per Table 3-7.

int index = 0;

// Try reading input[0].
// Try reading source[0].

if ((uint)index >= (uint)source.Length)
int index = 0;
if (source.IsEmpty)
{
goto NeedsMoreData;
}

uint tempValue = source[index];
if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
uint tempValue = source[0];
if (UnicodeUtility.IsAsciiCodePoint(tempValue))
{
goto NotAscii;
bytesConsumed = 1;
result = UnsafeCreate(tempValue);
return OperationStatus.Done;
}

Finish:

bytesConsumed = index + 1;
Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
result = UnsafeCreate(tempValue);
return OperationStatus.Done;

NotAscii:

// Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
// the range [C2..F4]. If it's outside of that range, it's either a standalone
// continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
// four-byte sequence.

// Try reading source[1].

index = 1;
if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
{
goto FirstByteInvalid;
goto Invalid;
}

tempValue = (tempValue - 0xC2) << 6;

// Try reading input[1].

index++;
if ((uint)index >= (uint)source.Length)
if (source.Length <= 1)
{
goto NeedsMoreData;
}
Expand All @@ -443,7 +435,7 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> source, out Rune
// complement representation is in the range [-65..-128]. This allows us to
// perform a single comparison to see if a byte is a continuation byte.

int thisByteSignExtended = (sbyte)source[index];
int thisByteSignExtended = (sbyte)source[1];
if (thisByteSignExtended >= -64)
{
goto Invalid;
Expand Down Expand Up @@ -485,15 +477,15 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> source, out Rune
// The first two bytes were just fine. We don't need to perform any other checks
// on the remaining bytes other than to see that they're valid continuation bytes.

// Try reading input[2].
// Try reading source[2].

index++;
if ((uint)index >= (uint)source.Length)
index = 2;
if (source.Length <= 2)
{
goto NeedsMoreData;
}

thisByteSignExtended = (sbyte)source[index];
thisByteSignExtended = (sbyte)source[2];
if (thisByteSignExtended >= -64)
{
goto Invalid; // this byte is not a UTF-8 continuation byte
Expand All @@ -510,15 +502,15 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> source, out Rune
goto Finish; // this is a valid 3-byte sequence
}

// Try reading input[3].
// Try reading source[3].

index++;
if ((uint)index >= (uint)source.Length)
index = 3;
if (source.Length <= 3)
{
goto NeedsMoreData;
}

thisByteSignExtended = (sbyte)source[index];
thisByteSignExtended = (sbyte)source[3];
if (thisByteSignExtended >= -64)
{
goto Invalid; // this byte is not a UTF-8 continuation byte
Expand All @@ -529,26 +521,29 @@ public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> source, out Rune
tempValue += 0x80; // remove the continuation byte marker
tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker

// Valid 4-byte sequence
UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
goto Finish; // this is a valid 4-byte sequence

FirstByteInvalid:
Finish:

index = 1; // Invalid subsequences are always at least length 1.
bytesConsumed = index + 1;
Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
result = UnsafeCreate(tempValue);
return OperationStatus.Done;

Invalid:
NeedsMoreData:

Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
bytesConsumed = index;
result = ReplacementChar;
return OperationStatus.InvalidData;
return OperationStatus.NeedMoreData;

NeedsMoreData:
Invalid:

Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
bytesConsumed = index;
result = ReplacementChar;
return OperationStatus.NeedMoreData;
return OperationStatus.InvalidData;
}

/// <summary>
Expand Down