Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 585180f

Browse files
Add basic UTF-8 validation APIs
1 parent 06ab66c commit 585180f

File tree

5 files changed

+1128
-0
lines changed

5 files changed

+1128
-0
lines changed

src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,10 @@
732732
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF32Encoding.cs" />
733733
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" />
734734
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" />
735+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Utf8Utility.cs" />
736+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Utf8Utility.Validation.cs" />
735737
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
738+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
736739
<Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
737740
<Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
738741
<Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />
src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System.Buffers;
6+
7+
namespace System.Text.Unicode
8+
{
9+
/// <summary>
10+
/// Provides facilities for inspecting, transcoding, and manipulating UTF-8 data.
11+
/// </summary>
12+
public static class Utf8
13+
{
14+
/// <summary>
15+
/// Determines whether <paramref name="source"/> represents a well-formed UTF-8 sequence.
16+
/// </summary>
/// <param name="source">The bytes to check for UTF-8 well-formedness.</param>
17+
/// <returns>
18+
/// <see langword="true"/> if the sequence is well-formed UTF-8; <see langword="false"/> otherwise.
19+
/// </returns>
20+
/// <remarks>
21+
/// Returns <see langword="true"/> if given an empty input.
22+
/// </remarks>
23+
public static bool IsWellFormed(ReadOnlySpan<byte> source)
24+
{
25+
// Public entry point only; the check itself is performed by Utf8Utility.IsWellFormedSequence.
return Utf8Utility.IsWellFormedSequence(source);
26+
}
27+
28+
/// <summary>
29+
/// Returns the index of the first byte in <paramref name="source"/> that represents the start of an
30+
/// invalid UTF-8 subsequence, along with the UTF-16 code unit count and <see cref="Rune"/> count of
31+
/// the sequence.
32+
/// </summary>
/// <param name="source">The bytes to scan for the first invalid UTF-8 subsequence.</param>
/// <param name="utf16CharCount">Receives the UTF-16 code unit count described in the remarks.</param>
/// <param name="runeCount">Receives the <see cref="Rune"/> count described in the remarks.</param>
33+
/// <returns>
34+
/// A non-negative integer representing the index of the first byte in <paramref name="source"/> that
35+
/// begins an invalid UTF-8 subsequence, or -1 if <paramref name="source"/> is well-formed.
36+
/// </returns>
37+
/// <remarks>
38+
/// <paramref name="utf16CharCount"/> and <paramref name="runeCount"/> represent the UTF-16 code unit count
39+
/// and the <see cref="Rune"/> count from the beginning of the <paramref name="source"/> buffer up until
40+
/// the reported first invalid subsequence. If <paramref name="source"/> is well-formed, <paramref name="utf16CharCount"/>
41+
/// and <paramref name="runeCount"/> represent the respective counts for the entire buffer.
42+
/// </remarks>
43+
public static int GetIndexOfFirstInvalidByte(ReadOnlySpan<byte> source, out int utf16CharCount, out int runeCount)
44+
{
45+
// Public entry point only; the scan and both out values come from
// Utf8Utility.GetIndexOfFirstInvalidSubsequence.
return Utf8Utility.GetIndexOfFirstInvalidSubsequence(source, out utf16CharCount, out runeCount);
46+
}
47+
}
48+
}

src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5+
using System.Diagnostics;
56
using System.Runtime.CompilerServices;
67

78
namespace System.Text
@@ -155,6 +156,38 @@ public static int GetUtf8SequenceLength(uint value)
155156
[MethodImpl(MethodImplOptions.AggressiveInlining)]
156157
public static bool IsSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xD800U, 0xDFFFU);
157158

159+
/// <summary>
160+
/// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte;
161+
/// i.e., has binary representation 10xxxxxx, where x is any bit.
162+
/// </summary>
163+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
164+
public static bool IsUtf8ContinuationByte(in byte value)
165+
{
166+
// This API takes its input as a readonly ref so that the JITter can emit "cmp ModRM" statements
167+
// directly rather than bounce a temporary through a register. That is, we want the JIT to be
168+
// able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location
169+
// to see if it's a continuation byte. Data that's already enregistered will go through the
170+
// normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions.
171+
172+
// The below check takes advantage of the two's complement representation of negative numbers.
173+
// [ 0b1000_0000, 0b1011_1111 ] is [ -128 (sbyte.MinValue), -65 ] when reinterpreted as sbyte,
// so a single signed "less than -64" comparison matches exactly the continuation bytes.
174+
175+
return ((sbyte)value < -64);
176+
}
177+
178+
/// <summary>
179+
/// Returns <see langword="true"/> iff <paramref name="value"/> is the 32-bit expansion of a UTF-8
180+
/// continuation byte; i.e., is in the range 0x80 to 0xBF, inclusive.
181+
/// </summary>
182+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
183+
public static bool IsUtf8ContinuationByte(uint value)
184+
{
185+
// TODO: This should really be using a single 8-bit cmp instruction rather than a 32-bit lea, cmp.
186+
187+
// Precondition: the caller passes a value that fits in one byte. Debug.Assert only checks this
// when assertions are compiled in; otherwise an out-of-range value simply fails the range test.
Debug.Assert(value <= byte.MaxValue);
188+
// Continuation bytes are exactly 0x80..0xBF inclusive (binary 10xxxxxx).
return IsInRangeInclusive(value, 0x80U, 0xBFU);
189+
}
190+
158191
/// <summary>
159192
/// Returns <see langword="true"/> iff <paramref name="codePoint"/> is a valid Unicode code
160193
/// point, i.e., is in [ U+0000..U+10FFFF ], inclusive.

0 commit comments

Comments
 (0)