This repository was archived by the owner on Jan 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Add OperationStatus-based UTF8 transcoding APIs #23219
Merged
GrabYourPitchforks
merged 2 commits into
dotnet:master
from
GrabYourPitchforks:utf8_transcode_simple_1
Mar 13, 2019
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
198 changes: 198 additions & 0 deletions
198
src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System.Buffers; | ||
using System.Diagnostics; | ||
|
||
namespace System.Text.Unicode | ||
{ | ||
public static class Utf8
{
    /*
     * OperationStatus-based APIs for transcoding of chunked data.
     * These methods are similar to Encoding.UTF8.GetBytes / GetChars but have a
     * different calling convention, different error handling mechanisms, and
     * different performance characteristics.
     *
     * If 'replaceInvalidSequences' is true, the method will replace any ill-formed
     * subsequence in the source with U+FFFD when transcoding to the destination,
     * then it will continue processing the remainder of the buffers. Otherwise
     * the method will return OperationStatus.InvalidData.
     *
     * If the method does return an error code, the out parameters will represent
     * how much of the data was successfully transcoded, and the location of the
     * ill-formed subsequence can be deduced from these values.
     *
     * If 'replaceInvalidSequences' is true, the method is guaranteed never to return
     * OperationStatus.InvalidData. If 'isFinalBlock' is true, the method is
     * guaranteed never to return OperationStatus.NeedMoreData.
     */

    /// <summary>
    /// Transcodes the UTF-16 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-8.
    /// </summary>
    /// <param name="source">The UTF-16 input to transcode.</param>
    /// <param name="destination">The buffer that receives the UTF-8 output.</param>
    /// <param name="numCharsRead">
    /// On return, the number of chars at the start of <paramref name="source"/> that were
    /// successfully transcoded. On failure this is the offset of the sequence that could not be processed.
    /// </param>
    /// <param name="numBytesWritten">On return, the number of bytes written to <paramref name="destination"/>.</param>
    /// <param name="replaceInvalidSequences">
    /// <see langword="true"/> to substitute U+FFFD for each invalid sequence and keep going;
    /// <see langword="false"/> to stop and return <see cref="OperationStatus.InvalidData"/>.
    /// </param>
    /// <param name="isFinalBlock">
    /// <see langword="true"/> if no further input will follow, in which case an incomplete trailing
    /// sequence is treated as invalid data; <see langword="false"/> to stop and return
    /// <see cref="OperationStatus.NeedMoreData"/> instead.
    /// </param>
    /// <returns>
    /// <see cref="OperationStatus.Done"/> if all of <paramref name="source"/> was consumed;
    /// <see cref="OperationStatus.DestinationTooSmall"/> if the next scalar value did not fit in
    /// <paramref name="destination"/>; otherwise <see cref="OperationStatus.NeedMoreData"/> or
    /// <see cref="OperationStatus.InvalidData"/> as described above.
    /// </returns>
    /// <remarks>
    /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-16 sequences
    /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
    /// this method will not return <see cref="OperationStatus.InvalidData"/>.
    /// </remarks>
    public static OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
    {
        int originalSourceLength = source.Length;
        int originalDestinationLength = destination.Length;
        OperationStatus status = OperationStatus.Done;

        // In a loop, this is going to read and transcode one scalar value at a time
        // from the source to the destination.

        while (!source.IsEmpty)
        {
            status = Rune.DecodeUtf16(source, out Rune firstScalarValue, out int charsConsumed);

            switch (status)
            {
                case OperationStatus.NeedMoreData:

                    // Input buffer ended with a high surrogate. Only treat this as an error
                    // if the caller told us that we shouldn't expect additional data in a
                    // future call.

                    if (!isFinalBlock)
                    {
                        goto Finish;
                    }

                    status = OperationStatus.InvalidData;
                    goto case OperationStatus.InvalidData;

                case OperationStatus.InvalidData:

                    // Input buffer contained invalid data. If the caller told us not to
                    // perform U+FFFD replacement, terminate the loop immediately and return
                    // an error to the caller.

                    if (!replaceInvalidSequences)
                    {
                        goto Finish;
                    }

                    firstScalarValue = Rune.ReplacementChar;
                    goto default;

                default:

                    // We know which scalar value we need to transcode to UTF-8.
                    // Do so now, and only terminate the loop if we ran out of space
                    // in the destination buffer.

                    if (firstScalarValue.TryEncodeToUtf8Bytes(destination, out int bytesWritten))
                    {
                        source = source.Slice(charsConsumed); // don't use Rune.Utf8SequenceLength; we may have performed substitution
                        destination = destination.Slice(bytesWritten);
                        status = OperationStatus.Done; // forcibly set success
                        continue;
                    }
                    else
                    {
                        status = OperationStatus.DestinationTooSmall;
                        goto Finish;
                    }
            }
        }

    Finish:

        numCharsRead = originalSourceLength - source.Length;
        numBytesWritten = originalDestinationLength - destination.Length;

        // Invariant: Done implies the whole input was consumed. The check must use '==':
        // a '<' comparison here would fire on every successful call (including empty input).
        Debug.Assert(numCharsRead == originalSourceLength || status != OperationStatus.Done,
            "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");

        return status;
    }

    /// <summary>
    /// Transcodes the UTF-8 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-16.
    /// </summary>
    /// <param name="source">The UTF-8 input to transcode.</param>
    /// <param name="destination">The buffer that receives the UTF-16 output.</param>
    /// <param name="numBytesRead">
    /// On return, the number of bytes at the start of <paramref name="source"/> that were
    /// successfully transcoded. On failure this is the offset of the sequence that could not be processed.
    /// </param>
    /// <param name="numCharsWritten">On return, the number of chars written to <paramref name="destination"/>.</param>
    /// <param name="replaceInvalidSequences">
    /// <see langword="true"/> to substitute U+FFFD for each invalid sequence and keep going;
    /// <see langword="false"/> to stop and return <see cref="OperationStatus.InvalidData"/>.
    /// </param>
    /// <param name="isFinalBlock">
    /// <see langword="true"/> if no further input will follow, in which case an incomplete trailing
    /// sequence is treated as invalid data; <see langword="false"/> to stop and return
    /// <see cref="OperationStatus.NeedMoreData"/> instead.
    /// </param>
    /// <returns>
    /// <see cref="OperationStatus.Done"/> if all of <paramref name="source"/> was consumed;
    /// <see cref="OperationStatus.DestinationTooSmall"/> if the next scalar value did not fit in
    /// <paramref name="destination"/>; otherwise <see cref="OperationStatus.NeedMoreData"/> or
    /// <see cref="OperationStatus.InvalidData"/> as described above.
    /// </returns>
    /// <remarks>
    /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-8 sequences
    /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
    /// this method will not return <see cref="OperationStatus.InvalidData"/>.
    /// </remarks>
    public static OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
    {
        int originalSourceLength = source.Length;
        int originalDestinationLength = destination.Length;
        OperationStatus status = OperationStatus.Done;

        // In a loop, this is going to read and transcode one scalar value at a time
        // from the source to the destination.

        while (!source.IsEmpty)
        {
            status = Rune.DecodeUtf8(source, out Rune firstScalarValue, out int bytesConsumed);

            switch (status)
            {
                case OperationStatus.NeedMoreData:

                    // Input buffer ended with a partial UTF-8 sequence. Only treat this as an error
                    // if the caller told us that we shouldn't expect additional data in a
                    // future call.

                    if (!isFinalBlock)
                    {
                        goto Finish;
                    }

                    status = OperationStatus.InvalidData;
                    goto case OperationStatus.InvalidData;

                case OperationStatus.InvalidData:

                    // Input buffer contained invalid data. If the caller told us not to
                    // perform U+FFFD replacement, terminate the loop immediately and return
                    // an error to the caller.

                    if (!replaceInvalidSequences)
                    {
                        goto Finish;
                    }

                    firstScalarValue = Rune.ReplacementChar;
                    goto default;

                default:

                    // We know which scalar value we need to transcode to UTF-16.
                    // Do so now, and only terminate the loop if we ran out of space
                    // in the destination buffer.

                    if (firstScalarValue.TryEncode(destination, out int charsWritten))
                    {
                        source = source.Slice(bytesConsumed); // don't use Rune.Utf16SequenceLength; we may have performed substitution
                        destination = destination.Slice(charsWritten);
                        status = OperationStatus.Done; // forcibly set success
                        continue;
                    }
                    else
                    {
                        status = OperationStatus.DestinationTooSmall;
                        goto Finish;
                    }
            }
        }

    Finish:

        numBytesRead = originalSourceLength - source.Length;
        numCharsWritten = originalDestinationLength - destination.Length;

        // Invariant: Done implies the whole input was consumed. The check must use '==':
        // a '<' comparison here would fire on every successful call (including empty input).
        Debug.Assert(numBytesRead == originalSourceLength || status != OperationStatus.Done,
            "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");

        return status;
    }
}
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.