Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit abd7add

Browse files
Add optimized UTF-8 validation and transcoding logic
- Hook it up through the existing Utf8 public static APIs
- Move some shared methods out of ASCIIUtility
- Hook it up through the Utf8String ctor
1 parent 2520798 commit abd7add

File tree

8 files changed

+3261
-204
lines changed

8 files changed

+3261
-204
lines changed

src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -766,6 +766,7 @@
766766
<Compile Include="$(MSBuildThisFileDirectory)System\SystemException.cs" />
767767
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIEncoding.cs" />
768768
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.cs" />
769+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.Helpers.cs" />
769770
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilderCache.cs" />
770771
<Compile Include="$(MSBuildThisFileDirectory)System\Text\CodePageDataItem.cs" />
771772
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Decoder.cs" />
@@ -804,6 +805,9 @@
804805
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
805806
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
806807
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.cs" />
808+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Helpers.cs" />
809+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Transcoding.cs" />
810+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Validation.cs" />
807811
<Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
808812
<Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
809813
<Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />
src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System.Diagnostics;
6+
using System.Numerics;
7+
using System.Runtime.CompilerServices;
8+
using System.Runtime.Intrinsics;
9+
using System.Runtime.Intrinsics.X86;
10+
using Internal.Runtime.CompilerServices;
11+
12+
#if BIT64
13+
using nint = System.Int64;
14+
using nuint = System.UInt64;
15+
#else // BIT64
16+
using nint = System.Int32;
17+
using nuint = System.UInt32;
18+
#endif // BIT64
19+
20+
namespace System.Text
21+
{
22+
internal static partial class ASCIIUtility
23+
{
24+
/// <summary>
25+
/// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII.
26+
/// </summary>
/// <param name="value">A DWORD whose four bytes are each tested independently.</param>
/// <returns>
/// <see langword="true"/> when none of the four bytes has its most significant bit set;
/// otherwise <see langword="false"/>.
/// </returns>
27+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
28+
internal static bool AllBytesInUInt32AreAscii(uint value)
29+
{
30+
// 0x80808080 selects the most significant bit of each of the four bytes. ASCII bytes
// (0x00 - 0x7F) never set that bit, so a zero result means every byte is ASCII.
// The test is symmetric across bytes, so the result does not depend on byte order.
return (value & 0x80808080u) == 0;
31+
}
32+
33+
34+
/// <summary>
35+
/// Given a 24-bit integer which represents a three-byte buffer read in machine endianness,
36+
/// counts the number of consecutive ASCII bytes starting from the beginning of the buffer.
37+
/// Returns a value 0 - 3, inclusive.
38+
/// </summary>
/// <param name="value">
/// The three buffer bytes packed into a DWORD. Only the most significant bit of each
/// inspected byte influences the result.
/// </param>
/// <returns>
/// The count of leading bytes whose most significant bit is clear, stopping at the first
/// byte whose most significant bit is set.
/// </returns>
39+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
40+
internal static uint CountNumberOfLeadingAsciiBytesFrom24BitInteger(uint value)
41+
{
42+
// This implementation seems to have better performance than tzcnt.
43+
44+
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
45+
// on whether all processed bytes were ASCII. Then we accumulate all of the
46+
// results to calculate how many consecutive ASCII bytes are present.
47+
48+
// Invert so that the high bit of each byte becomes 1 when that byte is ASCII and 0
// when it is not; the per-byte flag can then be read and summed directly.
value = ~value;
49+
50+
if (BitConverter.IsLittleEndian)
51+
{
52+
// Read first byte
// Shifting right by 7 moves (inverted) bit 7, the high bit of the lowest byte, into
// bit 0. The '& 1' discards everything else, so the flag is exactly 0 or 1.
53+
uint allBytesUpToNowAreAscii = (value >>= 7) & 1;
54+
uint numAsciiBytes = allBytesUpToNowAreAscii;
55+
56+
// Read second byte
// A further shift by 8 places original bit 15 in bit 0. Because the running flag is
// already 0 or 1, the AND both masks to bit 0 and keeps the flag at 0 once any
// earlier byte was non-ASCII.
57+
allBytesUpToNowAreAscii &= (value >>= 8);
58+
numAsciiBytes += allBytesUpToNowAreAscii;
59+
60+
// Read third byte
// Same step again: original bit 23 is now in bit 0. Bits 24 - 31 of the input are
// never inspected on this path.
61+
allBytesUpToNowAreAscii &= (value >>= 8);
62+
numAsciiBytes += allBytesUpToNowAreAscii;
63+
64+
return numAsciiBytes;
65+
}
66+
else
67+
{
68+
// Read first byte
// NOTE(review): ROL32 is defined outside this section - presumably a 32-bit
// rotate-left. If so, rotating by 1 brings original bit 31 into bit 0, and the two
// later rotations by 8 bring original bits 23 and 15 into bit 0, which would mean the
// three buffer bytes are expected in the upper three bytes of the DWORD on
// big-endian machines - confirm against ROL32 and the callers.
69+
uint allBytesUpToNowAreAscii = (value = ROL32(value, 1)) & 1;
70+
uint numAsciiBytes = allBytesUpToNowAreAscii;
71+
72+
// Read second byte
// As in the little-endian path, ANDing with the 0-or-1 running flag keeps only bit 0
// and keeps the flag at 0 after the first non-ASCII byte.
73+
allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
74+
numAsciiBytes += allBytesUpToNowAreAscii;
75+
76+
// Read third byte
77+
allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
78+
numAsciiBytes += allBytesUpToNowAreAscii;
79+
80+
return numAsciiBytes;
81+
}
82+
}
83+
}
84+
}

src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs

Lines changed: 0 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,6 @@ namespace System.Text
2121
{
2222
internal static partial class ASCIIUtility
2323
{
24-
/// <summary>
25-
/// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII.
26-
/// </summary>
27-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
28-
private static bool AllBytesInUInt32AreAscii(uint value)
29-
{
30-
return ((value & 0x80808080u) == 0);
31-
}
32-
3324
[MethodImpl(MethodImplOptions.AggressiveInlining)]
3425
private static bool AllBytesInUInt64AreAscii(ulong value)
3526
{
@@ -54,56 +45,6 @@ private static bool AllCharsInUInt64AreAscii(ulong value)
5445
return ((value & ~0x007F007F_007F007Ful) == 0);
5546
}
5647

57-
/// <summary>
58-
/// Given a 24-bit integer which represents a three-byte buffer read in machine endianness,
59-
/// counts the number of consecutive ASCII bytes starting from the beginning of the buffer.
60-
/// Returns a value 0 - 3, inclusive.
61-
/// </summary>
62-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
63-
private static uint CountNumberOfLeadingAsciiBytesFrom24BitInteger(uint value)
64-
{
65-
// This implementation seems to have better performance than tzcnt.
66-
67-
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
68-
// on whether all processed bytes were ASCII. Then we accumulate all of the
69-
// results to calculate how many consecutive ASCII bytes are present.
70-
71-
value = ~value;
72-
73-
if (BitConverter.IsLittleEndian)
74-
{
75-
// Read first byte
76-
uint allBytesUpToNowAreAscii = (value >>= 7) & 1;
77-
uint numAsciiBytes = allBytesUpToNowAreAscii;
78-
79-
// Read second byte
80-
allBytesUpToNowAreAscii &= (value >>= 8);
81-
numAsciiBytes += allBytesUpToNowAreAscii;
82-
83-
// Read third byte
84-
allBytesUpToNowAreAscii &= (value >>= 8);
85-
numAsciiBytes += allBytesUpToNowAreAscii;
86-
87-
return numAsciiBytes;
88-
}
89-
else
90-
{
91-
// Read first byte
92-
uint allBytesUpToNowAreAscii = (value = ROL32(value, 1)) & 1;
93-
uint numAsciiBytes = allBytesUpToNowAreAscii;
94-
95-
// Read second byte
96-
allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
97-
numAsciiBytes += allBytesUpToNowAreAscii;
98-
99-
// Read third byte
100-
allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
101-
numAsciiBytes += allBytesUpToNowAreAscii;
102-
103-
return numAsciiBytes;
104-
}
105-
}
106-
10748
/// <summary>
10849
/// Given a DWORD which represents two packed chars in machine-endian order,
10950
/// <see langword="true"/> iff the first char (in machine-endian order) is ASCII.

0 commit comments

Comments
 (0)