Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Add OperationStatus-based UTF8 transcoding APIs #23219

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />
Expand Down
198 changes: 198 additions & 0 deletions src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Buffers;
using System.Diagnostics;

namespace System.Text.Unicode
{
public static class Utf8
{
/*
* OperationStatus-based APIs for transcoding of chunked data.
* These methods are similar to Encoding.UTF8.GetBytes / GetChars but have a
* different calling convention, different error handling mechanisms, and
* different performance characteristics.
*
* If 'replaceInvalidSequences' is true, the method will replace any ill-formed
* subsequence in the source with U+FFFD when transcoding to the destination,
* then it will continue processing the remainder of the buffers. Otherwise
* the method will return OperationStatus.InvalidData.
*
* If the method does return an error code, the out parameters will represent
* how much of the data was successfully transcoded, and the location of the
* ill-formed subsequence can be deduced from these values.
*
* If 'replaceInvalidSequences' is true, the method is guaranteed never to return
* OperationStatus.InvalidData. If 'isFinalBlock' is true, the method is
* guaranteed never to return OperationStatus.NeedMoreData.
*/

/// <summary>
/// Transcodes the UTF-16 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-8.
/// </summary>
/// <param name="source">The UTF-16 input to transcode.</param>
/// <param name="destination">The buffer that receives the UTF-8 output.</param>
/// <param name="numCharsRead">On return, the number of chars consumed from <paramref name="source"/>.</param>
/// <param name="numBytesWritten">On return, the number of bytes written to <paramref name="destination"/>.</param>
/// <param name="replaceInvalidSequences">
/// <see langword="true"/> to substitute U+FFFD for each invalid sequence and keep going;
/// <see langword="false"/> to stop at the first invalid sequence and report <see cref="OperationStatus.InvalidData"/>.
/// </param>
/// <param name="isFinalBlock">
/// <see langword="true"/> if no further input will follow, in which case an incomplete trailing sequence
/// is treated as invalid data; <see langword="false"/> to report <see cref="OperationStatus.NeedMoreData"/> instead.
/// </param>
/// <returns>
/// <see cref="OperationStatus.Done"/> if all of <paramref name="source"/> was transcoded;
/// <see cref="OperationStatus.DestinationTooSmall"/> if <paramref name="destination"/> could not hold the next
/// encoded scalar value; <see cref="OperationStatus.NeedMoreData"/> or <see cref="OperationStatus.InvalidData"/>
/// as described for <paramref name="isFinalBlock"/> and <paramref name="replaceInvalidSequences"/>.
/// In every case the out parameters describe how much data was successfully transcoded.
/// </returns>
/// <remarks>
/// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-16 sequences
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
public static OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
    int originalSourceLength = source.Length;
    int originalDestinationLength = destination.Length;
    OperationStatus status = OperationStatus.Done;

    // In a loop, this is going to read and transcode one scalar value at a time
    // from the source to the destination. Both spans are re-sliced as we go, so
    // the amount of progress is recovered from their remaining lengths at the end.

    while (!source.IsEmpty)
    {
        status = Rune.DecodeUtf16(source, out Rune firstScalarValue, out int charsConsumed);

        switch (status)
        {
            case OperationStatus.NeedMoreData:

                // Input buffer ended with a high surrogate. Only treat this as an error
                // if the caller told us that we shouldn't expect additional data in a
                // future call.

                if (!isFinalBlock)
                {
                    goto Finish;
                }

                status = OperationStatus.InvalidData;
                goto case OperationStatus.InvalidData;

            case OperationStatus.InvalidData:

                // Input buffer contained invalid data. If the caller told us not to
                // perform U+FFFD replacement, terminate the loop immediately and return
                // an error to the caller.

                if (!replaceInvalidSequences)
                {
                    goto Finish;
                }

                firstScalarValue = Rune.ReplacementChar;
                goto default;

            default:

                // We know which scalar value we need to transcode to UTF-8.
                // Do so now, and only terminate the loop if we ran out of space
                // in the destination buffer.

                if (firstScalarValue.TryEncodeToUtf8Bytes(destination, out int bytesWritten))
                {
                    source = source.Slice(charsConsumed); // don't use Rune.Utf8SequenceLength; we may have performed substitution
                    destination = destination.Slice(bytesWritten);
                    status = OperationStatus.Done; // forcibly set success
                    continue;
                }
                else
                {
                    status = OperationStatus.DestinationTooSmall;
                    goto Finish;
                }
        }
    }

    Finish:

    numCharsRead = originalSourceLength - source.Length;
    numBytesWritten = originalDestinationLength - destination.Length;

    // Invariant: Done implies the whole input was consumed. (Any other status means
    // we stopped early, so nothing further is asserted for those.)
    Debug.Assert(status != OperationStatus.Done || numCharsRead == originalSourceLength,
        "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");

    return status;
}

/// <summary>
/// Transcodes the UTF-8 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-16.
/// </summary>
/// <param name="source">The UTF-8 input to transcode.</param>
/// <param name="destination">The buffer that receives the UTF-16 output.</param>
/// <param name="numBytesRead">On return, the number of bytes consumed from <paramref name="source"/>.</param>
/// <param name="numCharsWritten">On return, the number of chars written to <paramref name="destination"/>.</param>
/// <param name="replaceInvalidSequences">
/// <see langword="true"/> to substitute U+FFFD for each invalid sequence and keep going;
/// <see langword="false"/> to stop at the first invalid sequence and report <see cref="OperationStatus.InvalidData"/>.
/// </param>
/// <param name="isFinalBlock">
/// <see langword="true"/> if no further input will follow, in which case an incomplete trailing sequence
/// is treated as invalid data; <see langword="false"/> to report <see cref="OperationStatus.NeedMoreData"/> instead.
/// </param>
/// <returns>
/// <see cref="OperationStatus.Done"/> if all of <paramref name="source"/> was transcoded;
/// <see cref="OperationStatus.DestinationTooSmall"/> if <paramref name="destination"/> could not hold the next
/// encoded scalar value; <see cref="OperationStatus.NeedMoreData"/> or <see cref="OperationStatus.InvalidData"/>
/// as described for <paramref name="isFinalBlock"/> and <paramref name="replaceInvalidSequences"/>.
/// In every case the out parameters describe how much data was successfully transcoded.
/// </returns>
/// <remarks>
/// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-8 sequences
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
public static OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
    int originalSourceLength = source.Length;
    int originalDestinationLength = destination.Length;
    OperationStatus status = OperationStatus.Done;

    // In a loop, this is going to read and transcode one scalar value at a time
    // from the source to the destination. Both spans are re-sliced as we go, so
    // the amount of progress is recovered from their remaining lengths at the end.

    while (!source.IsEmpty)
    {
        status = Rune.DecodeUtf8(source, out Rune firstScalarValue, out int bytesConsumed);

        switch (status)
        {
            case OperationStatus.NeedMoreData:

                // Input buffer ended with a partial UTF-8 sequence. Only treat this as an error
                // if the caller told us that we shouldn't expect additional data in a
                // future call.

                if (!isFinalBlock)
                {
                    goto Finish;
                }

                status = OperationStatus.InvalidData;
                goto case OperationStatus.InvalidData;

            case OperationStatus.InvalidData:

                // Input buffer contained invalid data. If the caller told us not to
                // perform U+FFFD replacement, terminate the loop immediately and return
                // an error to the caller.

                if (!replaceInvalidSequences)
                {
                    goto Finish;
                }

                firstScalarValue = Rune.ReplacementChar;
                goto default;

            default:

                // We know which scalar value we need to transcode to UTF-16.
                // Do so now, and only terminate the loop if we ran out of space
                // in the destination buffer.

                if (firstScalarValue.TryEncode(destination, out int charsWritten))
                {
                    source = source.Slice(bytesConsumed); // don't use Rune.Utf16SequenceLength; we may have performed substitution
                    destination = destination.Slice(charsWritten);
                    status = OperationStatus.Done; // forcibly set success
                    continue;
                }
                else
                {
                    status = OperationStatus.DestinationTooSmall;
                    goto Finish;
                }
        }
    }

    Finish:

    numBytesRead = originalSourceLength - source.Length;
    numCharsWritten = originalDestinationLength - destination.Length;

    // Invariant: Done implies the whole input was consumed. (Any other status means
    // we stopped early, so nothing further is asserted for those.)
    Debug.Assert(status != OperationStatus.Done || numBytesRead == originalSourceLength,
        "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");

    return status;
}
}
}