Skip to content

Commit 97f5955

Browse files
GrabYourPitchforksjlennox
authored and committed
Allow rune enumeration from string and ROS<char> (dotnet/coreclr#21007)
Signed-off-by: dotnet-bot <dotnet-bot@microsoft.com>
1 parent f4aa82a commit 97f5955

File tree

6 files changed

+194
-0
lines changed

6 files changed

+194
-0
lines changed

src/Common/src/CoreLib/System.Private.CoreLib.Shared.projitems

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,8 +652,10 @@
652652
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Latin1Encoding.cs" />
653653
<Compile Include="$(MSBuildThisFileDirectory)System\Text\NormalizationForm.cs" />
654654
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Rune.cs" />
655+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\SpanRuneEnumerator.cs" />
655656
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilder.cs" />
656657
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilder.Debug.cs" Condition="'$(Configuration)' == 'Debug'" />
658+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringRuneEnumerator.cs" />
657659
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeDebug.cs" />
658660
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeEncoding.cs" />
659661
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeUtility.cs" />

src/Common/src/CoreLib/System/MemoryExtensions.cs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Collections.Generic;
66
using System.Runtime.CompilerServices;
77
using System.Runtime.InteropServices;
8+
using System.Text;
89

910
using Internal.Runtime.CompilerServices;
1011

@@ -975,6 +976,28 @@ ref MemoryMarshal.GetReference(value),
975976
valueLength);
976977
}
977978

979+
/// <summary>
/// Creates an enumerator that yields the contents of a read-only UTF-16 span as a sequence of <see cref="Rune"/> values.
/// </summary>
/// <param name="span">The UTF-16 code units to enumerate.</param>
/// <returns>A <see cref="SpanRuneEnumerator"/> constructed over <paramref name="span"/>.</returns>
/// <remarks>
/// Invalid sequences will be represented in the enumeration by <see cref="Rune.ReplacementChar"/>.
/// </remarks>
public static SpanRuneEnumerator EnumerateRunes(this ReadOnlySpan<char> span)
    => new SpanRuneEnumerator(span);
989+
990+
/// <summary>
/// Creates an enumerator that yields the contents of a writable UTF-16 span as a sequence of <see cref="Rune"/> values.
/// </summary>
/// <param name="span">The UTF-16 code units to enumerate.</param>
/// <returns>A <see cref="SpanRuneEnumerator"/> constructed over a read-only view of <paramref name="span"/>.</returns>
/// <remarks>
/// Invalid sequences will be represented in the enumeration by <see cref="Rune.ReplacementChar"/>.
/// </remarks>
public static SpanRuneEnumerator EnumerateRunes(this Span<char> span)
{
    // The enumerator's constructor accepts a ReadOnlySpan<char>; make the conversion explicit.
    ReadOnlySpan<char> readOnlyView = span;
    return new SpanRuneEnumerator(readOnlyView);
}
1000+
9781001
/// <summary>
9791002
/// Reverses the sequence of the elements in the entire span.
9801003
/// </summary>

src/Common/src/CoreLib/System/String.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,17 @@ IEnumerator IEnumerable.GetEnumerator()
532532
return new CharEnumerator(this);
533533
}
534534

535+
/// <summary>
/// Creates an enumerator that yields the contents of this string as a sequence of <see cref="Rune"/> values.
/// </summary>
/// <returns>A <see cref="StringRuneEnumerator"/> constructed over this string instance.</returns>
/// <remarks>
/// Invalid sequences will be represented in the enumeration by <see cref="Rune.ReplacementChar"/>.
/// </remarks>
public StringRuneEnumerator EnumerateRunes() => new StringRuneEnumerator(this);
545+
535546
internal static unsafe int wcslen(char* ptr)
536547
{
537548
char* end = ptr;

src/Common/src/CoreLib/System/Text/Rune.cs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,43 @@ public static Rune GetRuneAt(string input, int index)
249249
[CLSCompliant(false)]
250250
public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value);
251251

252+
// Decodes the scalar value that begins at input[0].
// Returns the scalar value on success, or a negative number on failure
// (empty buffer, lone low surrogate, or a high surrogate that is not
// immediately followed by a low surrogate).
internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan<char> input)
{
    if (input.IsEmpty)
    {
        return -1;
    }

    uint firstCodeUnit = input[0];

    // Fast path: a non-surrogate code unit is already a complete BMP scalar.
    if (!UnicodeUtility.IsSurrogateCodePoint(firstCodeUnit))
    {
        return (int)firstCodeUnit;
    }

    // A surrogate is only acceptable here if it is the leading (high) half of a pair.
    if (!UnicodeUtility.IsHighSurrogateCodePoint(firstCodeUnit))
    {
        return -1;
    }

    // A pair needs a second code unit; running out of data is a
    // "bad data" failure, not an argument exception.
    if ((uint)input.Length <= 1)
    {
        return -1;
    }

    uint secondCodeUnit = input[1];
    if (!UnicodeUtility.IsLowSurrogateCodePoint(secondCodeUnit))
    {
        return -1;
    }

    // Well-formed surrogate pair: combine both halves into a supplementary-plane scalar.
    return (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(firstCodeUnit, secondCodeUnit);
}
288+
252289
// returns a negative number on failure
253290
private static int ReadRuneFromString(string input, int index)
254291
{
src/Common/src/CoreLib/System/Text/SpanRuneEnumerator.cs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
namespace System.Text
{
    // Enumerates System.Text.Rune values decoded from a ReadOnlySpan<char>.
    // It exposes GetEnumerator / MoveNext / Current so that the compiler's
    // pattern-based foreach support can consume it directly.
    public ref struct SpanRuneEnumerator
    {
        // The portion of the buffer that has not been decoded yet.
        private ReadOnlySpan<char> _remaining;

        // The value most recently produced by MoveNext (default before the first call and after the end).
        private Rune _current;

        internal SpanRuneEnumerator(ReadOnlySpan<char> buffer)
        {
            _current = default;
            _remaining = buffer;
        }

        public Rune Current
        {
            get { return _current; }
        }

        public SpanRuneEnumerator GetEnumerator()
        {
            return this;
        }

        // Decodes the next scalar value from the front of the unread data.
        // Returns false, and resets Current to default, once the buffer is exhausted.
        public bool MoveNext()
        {
            if (!_remaining.IsEmpty)
            {
                int decoded = Rune.ReadFirstRuneFromUtf16Buffer(_remaining);

                // A negative result signals an invalid sequence; substitute U+FFFD for it.
                int scalarValue = (decoded < 0) ? Rune.ReplacementChar.Value : decoded;

                // An invalid UTF-16 sequence is always exactly one code unit long, and so is
                // U+FFFD. Advancing by the UTF-16 length of the rune we just produced is
                // therefore correct for both valid and replaced data. This shortcut is specific
                // to UTF-16; it would not hold when enumerating scalars from UTF-8, for example.
                Rune next = Rune.UnsafeCreate((uint)scalarValue);
                _current = next;
                _remaining = _remaining.Slice(next.Utf16SequenceLength);
                return true;
            }

            // reached the end of the buffer
            _current = default;
            return false;
        }
    }
}
src/Common/src/CoreLib/System/Text/StringRuneEnumerator.cs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System.Collections;
6+
using System.Collections.Generic;
7+
8+
namespace System.Text
{
    // Enumerates System.Text.Rune values decoded from a System.String.
    // The struct acts as both the enumerable and the enumerator.
    public struct StringRuneEnumerator : IEnumerable<Rune>, IEnumerator<Rune>
    {
        // The string being enumerated.
        private readonly string _string;

        // The value most recently produced by MoveNext (default before the first call and after the end).
        private Rune _current;

        // Index into _string of the first code unit that has not been decoded yet.
        private int _nextIndex;

        internal StringRuneEnumerator(string value)
        {
            _string = value;
            _nextIndex = 0;
            _current = default;
        }

        public Rune Current
        {
            get { return _current; }
        }

        object IEnumerator.Current
        {
            get { return _current; }
        }

        public StringRuneEnumerator GetEnumerator()
        {
            return this;
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return this;
        }

        IEnumerator<Rune> IEnumerable<Rune>.GetEnumerator()
        {
            return this;
        }

        // Decodes the next scalar value starting at _nextIndex.
        // Returns false, and resets Current to default, once the string is exhausted.
        public bool MoveNext()
        {
            if ((uint)_nextIndex < _string.Length)
            {
                if (Rune.TryGetRuneAt(_string, _nextIndex, out Rune decoded))
                {
                    _current = decoded;
                }
                else
                {
                    // replace invalid sequences with U+FFFD
                    _current = Rune.ReplacementChar;
                }

                // An invalid UTF-16 sequence is always exactly one code unit long, and so is
                // U+FFFD. Advancing by the UTF-16 length of the rune we just produced is
                // therefore correct for both valid and replaced data. This shortcut is specific
                // to UTF-16; it would not hold when enumerating scalars from UTF-8, for example.
                _nextIndex += _current.Utf16SequenceLength;
                return true;
            }

            // reached the end of the string
            _current = default;
            return false;
        }

        // Rewinds the enumerator to its initial position, before the first rune.
        void IEnumerator.Reset()
        {
            _nextIndex = 0;
            _current = default;
        }

        void IDisposable.Dispose()
        {
            // no-op
        }
    }
}

0 commit comments

Comments
 (0)