This repository has been archived by the owner on Jan 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4.9k
Add Span<T> Base64 conversion APIs that support UTF-8 #24888
Merged
Merged
Changes from 3 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
8698f4f
Add Span<T> Base64 conversion APIs that support UTF-8.
ahsonkhan 9c5a70a
Optimize the encoding loop when there is plenty of available space
ahsonkhan 4829b19
Optimize EncodeInPlace and update DecodeBaseline perf test.
ahsonkhan 61bcec9
Addressing PR feedback, encode optimization, throw for negative lengths
ahsonkhan bebb7fc
Reenable commented out perf tests.
ahsonkhan 93c44b2
Cap the amount of data to process based on how much that will fit.
ahsonkhan a9065ae
Being explicit with access modifiers to follow guidelines.
ahsonkhan File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
namespace System.Buffers | ||
{ | ||
/// <summary> | ||
/// This enum defines the various potential statuses that can be returned from Span-based operations | ||
/// that support processing of input contained in multiple discontiguous buffers. | ||
/// </summary> | ||
public enum OperationStatus | ||
{ | ||
/// <summary> | ||
/// The entire input buffer has been processed and the operation is complete. | ||
/// </summary> | ||
Done, | ||
/// <summary> | ||
/// The input is partially processed, up to what could fit into the destination buffer. | ||
/// The caller can enlarge the destination buffer, slice the buffers appropriately, and retry. | ||
/// </summary> | ||
DestinationTooSmall, | ||
/// <summary> | ||
/// The input is partially processed, up to the last valid chunk of the input that could be consumed. | ||
/// The caller can stitch the remaining unprocessed input with more data, slice the buffers appropriately, and retry. | ||
/// </summary> | ||
NeedMoreData, | ||
/// <summary> | ||
/// The input contained invalid bytes which could not be processed. If the input is partially processed, | ||
/// the destination contains the partial result. This guarantees that no additional data appended to the input | ||
/// will make the invalid sequence valid. | ||
/// </summary> | ||
InvalidData, | ||
} | ||
} |
301 changes: 301 additions & 0 deletions
301
src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,301 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System.Buffers; | ||
using System.Diagnostics; | ||
using System.Runtime.CompilerServices; | ||
|
||
namespace System.Buffers.Text | ||
{ | ||
public static partial class Base64 | ||
{ | ||
/// <summary> | ||
/// Decode the span of UTF-8 encoded text represented as base 64 into binary data. | ||
/// If the input is not a multiple of 4, it will decode as much as it can, to the closest multiple of 4. | ||
/// </summary> | ||
/// <param name="utf8">The input span which contains UTF-8 encoded text in base 64 that needs to be decoded.</param> | ||
/// <param name="bytes">The output span which contains the result of the operation, i.e. the decoded binary data.</param> | ||
/// <param name="consumed">The number of input bytes consumed during the operation. This can be used to slice the input for subsequent calls, if necessary.</param> | ||
/// <param name="written">The number of bytes written into the output span. This can be used to slice the output for subsequent calls, if necessary.</param> | ||
/// <param name="isFinalBlock">True (default) when the input span contains the entire data to decode. | ||
/// Set to false only if it is known that the input span contains partial data with more data to follow.</param> | ||
/// <returns>It returns the OperationStatus enum values: | ||
/// - Done - on successful processing of the entire input span | ||
/// - DestinationTooSmall - if there is not enough space in the output span to fit the decoded input | ||
/// - NeedMoreData - only if isFinalBlock is false and the input is not a multiple of 4, otherwise the partial input would be considered as InvalidData | ||
/// - InvalidData - if the input contains bytes outside of the expected base 64 range, or if it contains invalid/more than two padding characters, | ||
/// or if the input is incomplete (i.e. not a multiple of 4) and isFinalBlock is true.</returns> | ||
public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Span<byte> bytes, out int consumed, out int written, bool isFinalBlock = true) | ||
{ | ||
    ref byte srcBytes = ref utf8.DangerousGetPinnableReference(); | ||
    ref byte destBytes = ref bytes.DangerousGetPinnableReference(); | ||
|
||
    int srcLength = utf8.Length & ~0x3; // only decode input up to the closest multiple of 4. | ||
    int destLength = bytes.Length; | ||
|
||
    int sourceIndex = 0; | ||
    int destIndex = 0; | ||
|
||
    if (utf8.Length == 0) goto DoneExit; | ||
|
||
    ref sbyte decodingMap = ref s_decodingMap[0]; | ||
|
||
    // Last bytes could have padding characters, so process them separately and treat them as valid only if isFinalBlock is true | ||
    // if isFinalBlock is false, padding characters are considered invalid | ||
    int skipLastChunk = isFinalBlock ? 4 : 0; | ||
|
||
    while (sourceIndex < srcLength - skipLastChunk) | ||
    { | ||
        // A negative result means at least one of the four bytes mapped to -1 in the decoding map. | ||
        int result = Decode(ref Unsafe.Add(ref srcBytes, sourceIndex), ref decodingMap); | ||
        if (result < 0) goto InvalidExit; | ||
        if (destIndex > destLength - 3) goto DestinationSmallExit; | ||
        WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, destIndex), result); | ||
        destIndex += 3; | ||
        sourceIndex += 4; | ||
    } | ||
|
||
    // If input is less than 4 bytes, srcLength == sourceIndex == 0 | ||
    // If input is not a multiple of 4, sourceIndex == srcLength != 0 | ||
    if (sourceIndex == srcLength) | ||
    { | ||
        if (isFinalBlock) goto InvalidExit; | ||
        // Not the final block, but every input byte was consumed as complete groups of 4: | ||
        // nothing is left pending, so report Done rather than NeedMoreData (as the returns documentation states). | ||
        if (sourceIndex == utf8.Length) goto DoneExit; | ||
        goto NeedMoreExit; | ||
    } | ||
|
||
    // if isFinalBlock is false, we will never reach this point | ||
|
||
    // Decode the last group of 4 by hand, since its third and fourth bytes may be padding. | ||
    int i0 = Unsafe.Add(ref srcBytes, srcLength - 4); | ||
    int i1 = Unsafe.Add(ref srcBytes, srcLength - 3); | ||
    int i2 = Unsafe.Add(ref srcBytes, srcLength - 2); | ||
    int i3 = Unsafe.Add(ref srcBytes, srcLength - 1); | ||
|
||
    i0 = Unsafe.Add(ref decodingMap, i0); | ||
    i1 = Unsafe.Add(ref decodingMap, i1); | ||
|
||
    i0 <<= 18; | ||
    i1 <<= 12; | ||
|
||
    i0 |= i1; | ||
|
||
    if (i3 != s_encodingPad) | ||
    { | ||
        // No padding in the last position: the group decodes to three bytes. | ||
        i2 = Unsafe.Add(ref decodingMap, i2); | ||
        i3 = Unsafe.Add(ref decodingMap, i3); | ||
|
||
        i2 <<= 6; | ||
|
||
        i0 |= i3; | ||
        i0 |= i2; | ||
|
||
        if (i0 < 0) goto InvalidExit; | ||
        if (destIndex > destLength - 3) goto DestinationSmallExit; | ||
        WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, destIndex), i0); | ||
        destIndex += 3; | ||
    } | ||
    else if (i2 != s_encodingPad) | ||
    { | ||
        // Only the last byte is padding: the group decodes to two bytes. | ||
        i2 = Unsafe.Add(ref decodingMap, i2); | ||
|
||
        i2 <<= 6; | ||
|
||
        i0 |= i2; | ||
|
||
        if (i0 < 0) goto InvalidExit; | ||
        if (destIndex > destLength - 2) goto DestinationSmallExit; | ||
        Unsafe.Add(ref destBytes, destIndex) = (byte)(i0 >> 16); | ||
        Unsafe.Add(ref destBytes, destIndex + 1) = (byte)(i0 >> 8); | ||
        destIndex += 2; | ||
    } | ||
    else | ||
    { | ||
        // The last two bytes are padding: the group decodes to a single byte. | ||
        if (i0 < 0) goto InvalidExit; | ||
        if (destIndex > destLength - 1) goto DestinationSmallExit; | ||
        Unsafe.Add(ref destBytes, destIndex) = (byte)(i0 >> 16); | ||
        destIndex += 1; | ||
    } | ||
|
||
    sourceIndex += 4; | ||
|
||
    // Trailing bytes beyond the last multiple of 4 make a final block invalid. | ||
    if (srcLength != utf8.Length) goto InvalidExit; | ||
|
||
DoneExit: | ||
    consumed = sourceIndex; | ||
    written = destIndex; | ||
    return OperationStatus.Done; | ||
|
||
DestinationSmallExit: | ||
    if (srcLength != utf8.Length && isFinalBlock) goto InvalidExit; // if input is not a multiple of 4, and there is no more data, return invalid data instead | ||
    consumed = sourceIndex; | ||
    written = destIndex; | ||
    return OperationStatus.DestinationTooSmall; | ||
|
||
NeedMoreExit: | ||
    consumed = sourceIndex; | ||
    written = destIndex; | ||
    return OperationStatus.NeedMoreData; | ||
|
||
InvalidExit: | ||
    consumed = sourceIndex; | ||
    written = destIndex; | ||
    return OperationStatus.InvalidData; | ||
} | ||
|
||
/// <summary> | ||
/// Returns the maximum length (in bytes) of the result if you were to decode base 64 encoded text within a byte span of size "length". | ||
/// </summary> | ||
/// <param name="length">The size, in bytes, of the base 64 encoded input. Must not be negative.</param> | ||
/// <returns>Three bytes for every complete group of four input bytes, i.e. (length / 4) * 3.</returns> | ||
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="length"/> is negative.</exception> | ||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
public static int GetMaxDecodedFromUtf8Length(int length) | ||
{ | ||
    // Validate in all build flavors: a Debug.Assert alone lets release builds return a negative size. | ||
    if (length < 0) | ||
    { | ||
        throw new ArgumentOutOfRangeException(nameof(length)); | ||
    } | ||
|
||
    return (length >> 2) * 3; | ||
} | ||
|
||
/// <summary> | ||
/// Decode the span of UTF-8 encoded text in base 64 (in-place) into binary data. | ||
/// The decoded binary output is smaller than the text data contained in the input (the operation deflates the data). | ||
/// If the input is not a multiple of 4, it will not decode any. | ||
/// </summary> | ||
/// <param name="buffer">The input span which contains the base 64 text data that needs to be decoded.</param> | ||
/// <param name="written">The number of bytes written into the buffer.</param> | ||
/// <returns>It returns the OperationStatus enum values: | ||
/// - Done - on successful processing of the entire input span | ||
/// - InvalidData - if the input contains bytes outside of the expected base 64 range, or if it contains invalid/more than two padding characters, | ||
/// or if the input is incomplete (i.e. not a multiple of 4). | ||
/// It does not return DestinationTooSmall since that is not possible for base 64 decoding. | ||
/// It does not return NeedMoreData since this method tramples the data in the buffer and | ||
/// hence can only be called once with all the data in the buffer.</returns> | ||
public static OperationStatus DecodeFromUtf8InPlace(Span<byte> buffer, out int written) | ||
{ | ||
int bufferLength = buffer.Length; | ||
int sourceIndex = 0; | ||
int destIndex = 0; | ||
|
||
// only decode input if it is a multiple of 4 | ||
if (bufferLength != ((bufferLength >> 2) * 4)) goto InvalidExit; | ||
if (bufferLength == 0) goto DoneExit; | ||
|
||
ref byte bufferBytes = ref buffer.DangerousGetPinnableReference(); | ||
|
||
ref sbyte decodingMap = ref s_decodingMap[0]; | ||
|
||
// Decode every group of 4 except the last one, which is handled below because it is compared against s_encodingPad. | ||
// Each group reads 4 bytes and writes 3, so destIndex never moves ahead of sourceIndex and unread input is not overwritten. | ||
while (sourceIndex < bufferLength - 4) | ||
{ | ||
int result = Decode(ref Unsafe.Add(ref bufferBytes, sourceIndex), ref decodingMap); | ||
// A negative result means at least one of the four bytes mapped to -1 in the decoding map. | ||
if (result < 0) goto InvalidExit; | ||
WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, destIndex), result); | ||
destIndex += 3; | ||
sourceIndex += 4; | ||
} | ||
|
||
// Read all four bytes of the last group up front, before any of them can be overwritten by the output. | ||
int i0 = Unsafe.Add(ref bufferBytes, bufferLength - 4); | ||
int i1 = Unsafe.Add(ref bufferBytes, bufferLength - 3); | ||
int i2 = Unsafe.Add(ref bufferBytes, bufferLength - 2); | ||
int i3 = Unsafe.Add(ref bufferBytes, bufferLength - 1); | ||
|
||
i0 = Unsafe.Add(ref decodingMap, i0); | ||
i1 = Unsafe.Add(ref decodingMap, i1); | ||
|
||
i0 <<= 18; | ||
i1 <<= 12; | ||
|
||
i0 |= i1; | ||
|
||
// s_encodingPad is defined outside this block; presumably the '=' padding byte. | ||
if (i3 != s_encodingPad) | ||
{ | ||
// Last byte is not the pad value: the group decodes to three bytes. | ||
i2 = Unsafe.Add(ref decodingMap, i2); | ||
i3 = Unsafe.Add(ref decodingMap, i3); | ||
|
||
i2 <<= 6; | ||
|
||
i0 |= i3; | ||
i0 |= i2; | ||
|
||
if (i0 < 0) goto InvalidExit; | ||
WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, destIndex), i0); | ||
destIndex += 3; | ||
} | ||
else if (i2 != s_encodingPad) | ||
{ | ||
// Only the last byte is the pad value: the group decodes to two bytes. | ||
i2 = Unsafe.Add(ref decodingMap, i2); | ||
|
||
i2 <<= 6; | ||
|
||
i0 |= i2; | ||
|
||
if (i0 < 0) goto InvalidExit; | ||
Unsafe.Add(ref bufferBytes, destIndex) = (byte)(i0 >> 16); | ||
Unsafe.Add(ref bufferBytes, destIndex + 1) = (byte)(i0 >> 8); | ||
destIndex += 2; | ||
} | ||
else | ||
{ | ||
// The last two bytes are the pad value: the group decodes to a single byte. | ||
if (i0 < 0) goto InvalidExit; | ||
Unsafe.Add(ref bufferBytes, destIndex) = (byte)(i0 >> 16); | ||
destIndex += 1; | ||
} | ||
|
||
DoneExit: | ||
written = destIndex; | ||
return OperationStatus.Done; | ||
|
||
// On invalid input, written reports the bytes decoded before the invalid group was found. | ||
InvalidExit: | ||
written = destIndex; | ||
return OperationStatus.InvalidData; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static int Decode(ref byte encodedBytes, ref sbyte decodingMap) | ||
{ | ||
    // Map each of the four input bytes through the decoding table. The sbyte entries are | ||
    // sign-extended to int, so an entry of -1 stays negative through the shifts and ORs below. | ||
    int first = Unsafe.Add(ref decodingMap, (int)encodedBytes); | ||
    int second = Unsafe.Add(ref decodingMap, (int)Unsafe.Add(ref encodedBytes, 1)); | ||
    int third = Unsafe.Add(ref decodingMap, (int)Unsafe.Add(ref encodedBytes, 2)); | ||
    int fourth = Unsafe.Add(ref decodingMap, (int)Unsafe.Add(ref encodedBytes, 3)); | ||
|
||
    // Pack the four 6-bit values into the low 24 bits, first byte in the highest position. | ||
    return (first << 18) | (second << 12) | (third << 6) | fourth; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static void WriteThreeLowOrderBytes(ref byte destination, int value) | ||
{ | ||
    // Split the low 24 bits of value into three bytes, most significant first. | ||
    byte high = (byte)(value >> 16); | ||
    byte middle = (byte)(value >> 8); | ||
    byte low = (byte)value; | ||
|
||
    destination = high; | ||
    Unsafe.Add(ref destination, 1) = middle; | ||
    Unsafe.Add(ref destination, 2) = low; | ||
} | ||
|
||
// Pre-computing this table using a custom string(s_characters) and GenerateDecodingMapAndVerify (found in tests) | ||
// 256 entries, indexed by the input byte value: 0-63 for bytes in the base 64 alphabet, -1 for every other byte. | ||
static readonly sbyte[] s_decodingMap = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, //62 is placed at index 43 (for +), 63 at index 47 (for /) | ||
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, //52-61 are placed at index 48-57 (for 0-9); index 61 (for =) is -1, the decoders compare against s_encodingPad before using the map | ||
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, | ||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, //0-25 are placed at index 65-90 (for A-Z) | ||
-1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, | ||
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, //26-51 are placed at index 97-122 (for a-z) | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Bytes over 122 ('z') are invalid and cannot be decoded | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Hence, padding the map with -1, which indicates invalid input | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | ||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | ||
}; | ||
} | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we like debug assert here? Should we return 0 or throw?