Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Escaped UTF-16/32 with UTF-8 surrogates #841

Merged
merged 2 commits into from
Aug 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 2 additions & 9 deletions YamlDotNet.Benchmark/Program.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,4 @@
using System.Globalization;
using BenchmarkDotNet.Running;
using BenchmarkDotNet.Running;
using YamlDotNet.Benchmark;
using YamlDotNet.Core;
using YamlDotNet.Core.Events;
using YamlDotNet.Serialization;
using YamlDotNet.Serialization.NamingConventions;

// Fixed sample timestamp: 2017-01-02 03:04:05 with a -06:00 offset from UTC.
var dateTimeOffset = new DateTimeOffset(new DateTime(2017, 1, 2, 3, 4, 5), new TimeSpan(-6, 0, 0));
// Print the sample using an explicit custom pattern, formatted with the invariant culture.
Console.WriteLine(dateTimeOffset.ToString("MM/dd/yyyy HH:mm:ss zzz", CultureInfo.InvariantCulture));
// Print the same sample using the standard "O" (round-trip) format specifier.
Console.WriteLine(dateTimeOffset.ToString("O", CultureInfo.InvariantCulture));
// Pass the command-line arguments to BenchmarkDotNet's switcher, built from the assembly
// that contains YamlStreamBenchmark.
BenchmarkSwitcher.FromAssembly(typeof(YamlStreamBenchmark).Assembly).Run(args);
28 changes: 28 additions & 0 deletions YamlDotNet.Test/Core/ScannerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,34 @@ public void Keys_can_start_with_colons_after_double_quoted_values_in_nested_bloc
StreamEnd);
}

[Fact]
public void Utf16StringsAsUtf8SurrogatesWorkCorrectly()
{
    // The YAML text holds a double-quoted scalar made of two consecutive
    // backslash-u escape sequences (D83D followed by DC4D). The backslashes are
    // doubled here so the scanner receives the escapes themselves, not the
    // characters they denote.
    const string yamlWithEscapedPair = "Test: \"\\uD83D\\uDC4D\"";

    // U+1F44D (thumbs up) expressed as its UTF-16 surrogate pair. It is kept as
    // escapes rather than a literal emoji; the original note cites display
    // problems in Windows Terminal.
    const string thumbsUp = "\uD83D\uDC4D";

    var scanner = Yaml.ScannerForText(yamlWithEscapedPair);

    // The two escapes must be emitted as one scalar containing the pair.
    AssertSequenceOfTokensFrom(
        scanner,
        StreamStart,
        BlockMappingStart,
        Key,
        PlainScalar("Test"),
        Value,
        DoubleQuotedScalar(thumbsUp),
        BlockEnd,
        StreamEnd);
}

[Fact]
public void Utf16CharactersAreReadCorrectly()
{
    // U+1F44D (thumbs up) expressed as its UTF-16 surrogate pair. It is kept as
    // escapes rather than a literal emoji; the original note cites display
    // problems in Windows Terminal.
    const string thumbsUp = "\uD83D\uDC4D";

    // Here the C# compiler resolves the escapes, so the scanner is handed the
    // actual surrogate characters inside the double-quoted scalar (no YAML
    // escape sequences are involved).
    const string yamlWithRawPair = "Test: \"\uD83D\uDC4D\"";

    var scanner = Yaml.ScannerForText(yamlWithRawPair);

    // The raw characters must pass through the scanner unchanged.
    AssertSequenceOfTokensFrom(
        scanner,
        StreamStart,
        BlockMappingStart,
        Key,
        PlainScalar("Test"),
        Value,
        DoubleQuotedScalar(thumbsUp),
        BlockEnd,
        StreamEnd);
}

private void AssertPartialSequenceOfTokensFrom(Scanner scanner, params Token[] tokens)
{
var tokenNumber = 1;
Expand Down
63 changes: 56 additions & 7 deletions YamlDotNet/Core/Scanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1937,19 +1937,68 @@ private Scalar ScanFlowScalar(bool isSingleQuoted)

// Check the value and write the character.

if ((character >= 0xD800 && character <= 0xDFFF) || character > 0x10FFFF)
//check for utf-8 surrogate pair
if (character >= 0xD800 && character <= 0xDFFF)
{
for (var k = 0; k < codeLength; ++k)
{
Skip();
}

if (analyzer.Peek(0) == '\\' &&
(analyzer.Peek(1) == 'u' || analyzer.Peek(1) == 'U'))
{
Skip(); //escape character
if (analyzer.Peek(0) == 'u')
{
codeLength = 4;
}
else
{
codeLength = 8;
}
Skip(); //escape code

var lowSurrogate = 0;

// Scan the character value.
for (var k = 0; k < codeLength; ++k)
{
if (!analyzer.IsHex(0))
{
throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, did not find expected hexadecimal number.");
}
lowSurrogate = ((lowSurrogate << 4) + analyzer.AsHex(k));
}

for (var k = 0; k < codeLength; ++k)
{
Skip();
}

character = char.ConvertToUtf32((char)character, (char)lowSurrogate);
}
else
{
throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, found invalid Unicode surrogates.");
}
}
else if (character > 0x10FFFF)
{
throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, found invalid Unicode character escape code.");
}
else
{
// Advance the pointer.

value.Append(char.ConvertFromUtf32(character));

// Advance the pointer.
for (var k = 0; k < codeLength; ++k)
{
Skip();
}

for (var k = 0; k < codeLength; ++k)
{
Skip();
}

value.Append(char.ConvertFromUtf32(character));
}
}
else
Expand Down