Skip to content

Commit 1735a8e

Browse files
authored
Merge pull request #219 from mlaily/32bits-escapes
Properly handle 32bits Unicode code points in escape sequences and url-encoded tags
2 parents 42a9f8e + b6e390c commit 1735a8e

File tree

6 files changed

+72
-10
lines changed

6 files changed

+72
-10
lines changed

YamlDotNet.Test/Core/ParserTests.cs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,28 @@ public void VerifyTokenWithMultiDocTag()
388388
StreamEnd);
389389
}
390390

391+
// Parses a plain scalar "value" carrying a URL-encoded tag that mixes "%20"
// escapes with literal '+' characters, and asserts the resulting event stream:
// StreamStart, implicit DocumentStart, the tagged scalar, implicit DocumentEnd,
// StreamEnd. The expected tag has spaces where the input has "%20" or '+'.
// NOTE(review): the input contains three consecutive "%20" and a trailing
// "%20+", so the expected tag presumably contains runs of several spaces that
// may have been collapsed in this rendering - confirm against the repository.
[Fact]
392+
public void VerifyTokenWithUrlEncodedTagContainingPlusSpaces()
393+
{
394+
AssertSequenceOfEventsFrom(Yaml.ParserForText("!(%20%20%20hello+you%20+) value"),
395+
StreamStart,
396+
DocumentStart(Implicit),
397+
PlainScalar("value").T("!( hello you )"),
398+
DocumentEnd(Implicit),
399+
StreamEnd);
400+
}
401+
402+
// Parses a plain scalar "value" whose tag contains URL-encoded multi-byte
// UTF-8 sequences and asserts the decoded tag in the resulting event stream.
// "%F4%8F%BF%BF" is the four-byte UTF-8 encoding of U+10FFFF (a code point
// outside the Basic Multilingual Plane, i.e. a surrogate pair in a .NET
// string) and "%E2%99%A5" is the three-byte encoding of U+2665; the '+' and
// the trailing "%20" are both expected to appear as spaces in the tag.
[Fact]
403+
public void VerifyTokenWithUrlEncoded32BitsUnicodeTags()
404+
{
405+
AssertSequenceOfEventsFrom(Yaml.ParserForText("!hel%F4%8F%BF%BFlo%E2%99%A5+A%20 value"),
406+
StreamStart,
407+
DocumentStart(Implicit),
408+
PlainScalar("value").T("!hel􏿿lo♥ A "),
409+
DocumentEnd(Implicit),
410+
StreamEnd);
411+
}
412+
391413
[Theory]
392414
[InlineData("|\n b-carriage-return,b-line-feed\r\n lll", "b-carriage-return,b-line-feed\nlll")]
393415
[InlineData("|\n b-carriage-return\r lll", "b-carriage-return\nlll")]

YamlDotNet.Test/RepresentationModel/YamlStreamTests.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,12 @@ public void FailBackreference()
174174
RoundtripTest("fail-backreference.yaml");
175175
}
176176

177+
// Runs the shared RoundtripTest helper against the embedded fixture
// "unicode-32bits-escape.yaml", which holds the same text once as literal
// characters and once as "\U0010ffff" / "\u2665" escapes in a double-quoted
// scalar.
// NOTE(review): RoundtripTest is defined outside this view; presumably it
// loads the document, emits it and re-parses the output - confirm.
[Fact]
178+
public void Roundtrip32BitsUnicodeEscape()
179+
{
180+
RoundtripTest("unicode-32bits-escape.yaml");
181+
}
182+
177183
[Fact]
178184
public void AllAliasesMustBeResolved()
179185
{

YamlDotNet.Test/YamlDotNet.Test.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@
140140
<EmbeddedResource Include="files\guid.yaml" />
141141
<EmbeddedResource Include="files\ordered-properties.yaml" />
142142
<EmbeddedResource Include="files\multi-doc-tag.yaml" />
143+
<EmbeddedResource Include="files\unicode-32bits-escape.yaml" />
143144
<None Include="packages.config" />
144145
</ItemGroup>
145146
<ItemGroup>

YamlDotNet.Test/files/unicode-32bits-escape.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
- hel􏿿lo♥
2+
- "hel\U0010fffflo\u2665"

YamlDotNet/Core/Emitter.cs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1160,12 +1160,25 @@ private void WriteDoubleQuotedScalar(string value, bool allowBreaks)
11601160
break;
11611161

11621162
default:
1163-
var code = (short)character;
1163+
var code = (ushort)character;
11641164
if (code <= 0xFF)
11651165
{
11661166
Write('x');
11671167
Write(code.ToString("X02", CultureInfo.InvariantCulture));
11681168
}
1169+
else if (IsHighSurrogate(character))
1170+
{
1171+
if (index + 1 < value.Length && IsLowSurrogate(value[index + 1]))
1172+
{
1173+
Write('U');
1174+
Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
1175+
index++;
1176+
}
1177+
else
1178+
{
1179+
throw new SyntaxErrorException("While writing a quoted scalar, found an orphaned high surrogate.");
1180+
}
1181+
}
11691182
else
11701183
{
11711184
Write('u');
@@ -1342,6 +1355,16 @@ private static bool IsPrintable(char character)
13421355
(character >= '\xE000' && character <= '\xFFFD');
13431356
}
13441357

1358+
/// <summary>
/// Returns true when <paramref name="c"/> lies in the range 0xD800-0xDBFF
/// (inclusive), i.e. it is the leading (high) half of a UTF-16 surrogate pair.
/// </summary>
private static bool IsHighSurrogate(char c)
1359+
{
1360+
return 0xD800 <= c && c <= 0xDBFF;
1361+
}
1362+
1363+
/// <summary>
/// Returns true when <paramref name="c"/> lies in the range 0xDC00-0xDFFF
/// (inclusive), i.e. it is the trailing (low) half of a UTF-16 surrogate pair.
/// </summary>
private static bool IsLowSurrogate(char c)
1364+
{
1365+
return 0xDC00 <= c && c <= 0xDFFF;
1366+
}
1367+
13451368
#endregion
13461369

13471370
/// <summary>

YamlDotNet/Core/Scanner.cs

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1680,7 +1680,7 @@ private Token ScanFlowScalar(bool isSingleQuoted)
16801680

16811681
if (codeLength > 0)
16821682
{
1683-
uint character = 0;
1683+
int character = 0;
16841684

16851685
// Scan the character value.
16861686

@@ -1690,7 +1690,7 @@ private Token ScanFlowScalar(bool isSingleQuoted)
16901690
{
16911691
throw new SyntaxErrorException(start, cursor.Mark(), "While parsing a quoted scalar, did not find expected hexdecimal number.");
16921692
}
1693-
character = (uint)((character << 4) + analyzer.AsHex(k));
1693+
character = ((character << 4) + analyzer.AsHex(k));
16941694
}
16951695

16961696
// Check the value and write the character.
@@ -1700,7 +1700,7 @@ private Token ScanFlowScalar(bool isSingleQuoted)
17001700
throw new SyntaxErrorException(start, cursor.Mark(), "While parsing a quoted scalar, find invalid Unicode character escape code.");
17011701
}
17021702

1703-
value.Append((char)character);
1703+
value.Append(char.ConvertFromUtf32(character));
17041704

17051705
// Advance the pointer.
17061706

@@ -2145,6 +2145,11 @@ private string ScanTagUri(string head, Mark start)
21452145
{
21462146
tag.Append(ScanUriEscapes(start));
21472147
}
2148+
else if (analyzer.Check('+'))
2149+
{
2150+
tag.Append(' ');
2151+
Skip();
2152+
}
21482153
else
21492154
{
21502155
tag.Append(ReadCurrentCharacter());
@@ -2165,11 +2170,12 @@ private string ScanTagUri(string head, Mark start)
21652170
/// Decode an URI-escape sequence corresponding to a single UTF-8 character.
21662171
/// </summary>
21672172

2168-
private char ScanUriEscapes(Mark start)
2173+
private string ScanUriEscapes(Mark start)
21692174
{
21702175
// Decode the required number of characters.
21712176

2172-
var charBytes = new List<byte>();
2177+
byte[] charBytes = null;
2178+
int nextInsertionIndex = 0;
21732179
int width = 0;
21742180
do
21752181
{
@@ -2197,6 +2203,8 @@ private char ScanUriEscapes(Mark start)
21972203
{
21982204
throw new SyntaxErrorException(start, cursor.Mark(), "While parsing a tag, find an incorrect leading UTF-8 octet.");
21992205
}
2206+
2207+
charBytes = new byte[width];
22002208
}
22012209
else
22022210
{
@@ -2210,22 +2218,22 @@ private char ScanUriEscapes(Mark start)
22102218

22112219
// Copy the octet and move the pointers.
22122220

2213-
charBytes.Add((byte)octet);
2221+
charBytes[nextInsertionIndex++] = (byte)octet;
22142222

22152223
Skip();
22162224
Skip();
22172225
Skip();
22182226
}
22192227
while (--width > 0);
22202228

2221-
var characters = Encoding.UTF8.GetChars(charBytes.ToArray());
2229+
var result = Encoding.UTF8.GetString(charBytes, 0, nextInsertionIndex);
22222230

2223-
if (characters.Length != 1)
2231+
if (result.Length == 0 || result.Length > 2)
22242232
{
22252233
throw new SyntaxErrorException(start, cursor.Mark(), "While parsing a tag, find an incorrect UTF-8 sequence.");
22262234
}
22272235

2228-
return characters[0];
2236+
return result;
22292237
}
22302238

22312239
/// <summary>

0 commit comments

Comments
 (0)