Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 0bd315a

Browse files
GrabYourPitchforks authored and jkotas committed
Add OperationStatus-based UTF8 transcoding APIs (dotnet/coreclr#23219)
Signed-off-by: dotnet-bot <dotnet-bot@microsoft.com>
1 parent f546428 commit 0bd315a

File tree

2 files changed

+199
-0
lines changed

2 files changed

+199
-0
lines changed

src/Common/src/CoreLib/System.Private.CoreLib.Shared.projitems

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,7 @@
798798
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" />
799799
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" />
800800
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
801+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
801802
<Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
802803
<Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
803804
<Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System.Buffers;
6+
using System.Diagnostics;
7+
8+
namespace System.Text.Unicode
9+
{
10+
public static class Utf8
11+
{
12+
/*
13+
* OperationStatus-based APIs for transcoding of chunked data.
14+
* This method is similar to Encoding.UTF8.GetBytes / GetChars but has a
15+
* different calling convention, different error handling mechanisms, and
16+
* different performance characteristics.
17+
*
18+
* If 'replaceInvalidSequences' is true, the method will replace any ill-formed
19+
* subsequence in the source with U+FFFD when transcoding to the destination,
20+
* then it will continue processing the remainder of the buffers. Otherwise
21+
* the method will return OperationStatus.InvalidData.
22+
*
23+
* If the method does return an error code, the out parameters will represent
24+
* how much of the data was successfully transcoded, and the location of the
25+
* ill-formed subsequence can be deduced from these values.
26+
*
27+
* If 'replaceInvalidSequences' is true, the method is guaranteed never to return
28+
* OperationStatus.InvalidData. If 'isFinalBlock' is true, the method is
29+
* guaranteed never to return OperationStatus.NeedMoreData.
30+
*/
31+
32+
/// <summary>
/// Transcodes the UTF-16 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-8.
/// </summary>
/// <param name="source">The UTF-16 input to transcode.</param>
/// <param name="destination">The buffer that receives the UTF-8 output.</param>
/// <param name="numCharsRead">On return, the number of chars of <paramref name="source"/> that were consumed.</param>
/// <param name="numBytesWritten">On return, the number of bytes written to <paramref name="destination"/>.</param>
/// <param name="replaceInvalidSequences">
/// <see langword="true"/> to substitute U+FFFD for invalid input and keep going;
/// <see langword="false"/> to stop and return <see cref="OperationStatus.InvalidData"/>.
/// </param>
/// <param name="isFinalBlock">
/// <see langword="true"/> if no further input will follow; incomplete trailing input is then
/// treated as invalid data instead of producing <see cref="OperationStatus.NeedMoreData"/>.
/// </param>
/// <returns>
/// <see cref="OperationStatus.Done"/> if all of <paramref name="source"/> was transcoded;
/// <see cref="OperationStatus.DestinationTooSmall"/> if <paramref name="destination"/> could not hold the next scalar value;
/// <see cref="OperationStatus.NeedMoreData"/> if the input ended mid-sequence and <paramref name="isFinalBlock"/> is <see langword="false"/>;
/// <see cref="OperationStatus.InvalidData"/> if invalid input was found and <paramref name="replaceInvalidSequences"/> is <see langword="false"/>.
/// </returns>
/// <remarks>
/// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-16 sequences
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
public static OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
    int originalSourceLength = source.Length;
    int originalDestinationLength = destination.Length;
    OperationStatus status = OperationStatus.Done;

    // In a loop, this is going to read and transcode one scalar value at a time
    // from the source to the destination. Both spans are re-sliced as we go, so
    // the amount processed is recovered at the end from the length differences.

    while (!source.IsEmpty)
    {
        status = Rune.DecodeUtf16(source, out Rune firstScalarValue, out int charsConsumed);

        switch (status)
        {
            case OperationStatus.NeedMoreData:

                // Input buffer ended with a high surrogate. Only treat this as an error
                // if the caller told us that we shouldn't expect additional data in a
                // future call.

                if (!isFinalBlock)
                {
                    goto Finish;
                }

                status = OperationStatus.InvalidData;
                goto case OperationStatus.InvalidData;

            case OperationStatus.InvalidData:

                // Input buffer contained invalid data. If the caller told us not to
                // perform U+FFFD replacement, terminate the loop immediately and return
                // an error to the caller.

                if (!replaceInvalidSequences)
                {
                    goto Finish;
                }

                firstScalarValue = Rune.ReplacementChar;
                goto default;

            default:

                // We know which scalar value we need to transcode to UTF-8.
                // Do so now, and only terminate the loop if we ran out of space
                // in the destination buffer.

                if (firstScalarValue.TryEncodeToUtf8Bytes(destination, out int bytesWritten))
                {
                    source = source.Slice(charsConsumed); // don't use Rune.Utf8SequenceLength; we may have performed substitution
                    destination = destination.Slice(bytesWritten);
                    status = OperationStatus.Done; // forcibly set success
                    continue;
                }
                else
                {
                    status = OperationStatus.DestinationTooSmall;
                    goto Finish;
                }
        }
    }

Finish:

    numCharsRead = originalSourceLength - source.Length;
    numBytesWritten = originalDestinationLength - destination.Length;

    // Done is only legal when every input char was consumed (this includes the
    // empty-input case, where both sides of the equality are zero).
    Debug.Assert(numCharsRead == originalSourceLength || status != OperationStatus.Done,
        "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");

    return status;
}
114+
115+
/// <summary>
/// Transcodes the UTF-8 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-16.
/// </summary>
/// <param name="source">The UTF-8 input to transcode.</param>
/// <param name="destination">The buffer that receives the UTF-16 output.</param>
/// <param name="numBytesRead">On return, the number of bytes of <paramref name="source"/> that were consumed.</param>
/// <param name="numCharsWritten">On return, the number of chars written to <paramref name="destination"/>.</param>
/// <param name="replaceInvalidSequences">
/// <see langword="true"/> to substitute U+FFFD for invalid input and keep going;
/// <see langword="false"/> to stop and return <see cref="OperationStatus.InvalidData"/>.
/// </param>
/// <param name="isFinalBlock">
/// <see langword="true"/> if no further input will follow; incomplete trailing input is then
/// treated as invalid data instead of producing <see cref="OperationStatus.NeedMoreData"/>.
/// </param>
/// <returns>
/// <see cref="OperationStatus.Done"/> if all of <paramref name="source"/> was transcoded;
/// <see cref="OperationStatus.DestinationTooSmall"/> if <paramref name="destination"/> could not hold the next scalar value;
/// <see cref="OperationStatus.NeedMoreData"/> if the input ended mid-sequence and <paramref name="isFinalBlock"/> is <see langword="false"/>;
/// <see cref="OperationStatus.InvalidData"/> if invalid input was found and <paramref name="replaceInvalidSequences"/> is <see langword="false"/>.
/// </returns>
/// <remarks>
/// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-8 sequences
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
public static OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
    int originalSourceLength = source.Length;
    int originalDestinationLength = destination.Length;
    OperationStatus status = OperationStatus.Done;

    // In a loop, this is going to read and transcode one scalar value at a time
    // from the source to the destination. Both spans are re-sliced as we go, so
    // the amount processed is recovered at the end from the length differences.

    while (!source.IsEmpty)
    {
        status = Rune.DecodeUtf8(source, out Rune firstScalarValue, out int bytesConsumed);

        switch (status)
        {
            case OperationStatus.NeedMoreData:

                // Input buffer ended with a partial UTF-8 sequence. Only treat this as an error
                // if the caller told us that we shouldn't expect additional data in a
                // future call.

                if (!isFinalBlock)
                {
                    goto Finish;
                }

                status = OperationStatus.InvalidData;
                goto case OperationStatus.InvalidData;

            case OperationStatus.InvalidData:

                // Input buffer contained invalid data. If the caller told us not to
                // perform U+FFFD replacement, terminate the loop immediately and return
                // an error to the caller.

                if (!replaceInvalidSequences)
                {
                    goto Finish;
                }

                firstScalarValue = Rune.ReplacementChar;
                goto default;

            default:

                // We know which scalar value we need to transcode to UTF-16.
                // Do so now, and only terminate the loop if we ran out of space
                // in the destination buffer.

                if (firstScalarValue.TryEncode(destination, out int charsWritten))
                {
                    source = source.Slice(bytesConsumed); // don't use Rune.Utf16SequenceLength; we may have performed substitution
                    destination = destination.Slice(charsWritten);
                    status = OperationStatus.Done; // forcibly set success
                    continue;
                }
                else
                {
                    status = OperationStatus.DestinationTooSmall;
                    goto Finish;
                }
        }
    }

Finish:

    numBytesRead = originalSourceLength - source.Length;
    numCharsWritten = originalDestinationLength - destination.Length;

    // Done is only legal when every input byte was consumed (this includes the
    // empty-input case, where both sides of the equality are zero).
    Debug.Assert(numBytesRead == originalSourceLength || status != OperationStatus.Done,
        "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");

    return status;
}
197+
}
198+
}

0 commit comments

Comments
 (0)