Skip to content

Commit 6c26d5c

Browse files
committed
Fix JSON serialization for UTF-32 characters.
When serializing data in JSON-compatible form, 4-byte UTF-32 characters need to be split into two 2-byte UTF-16 code units (a surrogate pair). This change fixes that by introducing a new emitter setting, `UseUtf16SurrogatePairs`, which is set when a JSON-compatible builder is requested.
1 parent 7923dd8 commit 6c26d5c

File tree

4 files changed

+58
-4
lines changed

4 files changed

+58
-4
lines changed

YamlDotNet.Test/Serialization/SerializationTests.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -886,6 +886,15 @@ public void SerializationOfAnchorWorksInJson()
886886
.BeEquivalentTo(@"{""x"": {""z"": {""v"": ""1""}}, ""y"": {""k"": {""z"": {""v"": ""1""}}}}");
887887
}
888888

889+
[Fact]
890+
public void SerializationOfUtf32WorksInJson()
891+
{
892+
var obj = new { TestProperty = "Sea life \U0001F99E" };
893+
894+
SerializerBuilder.JsonCompatible().Build().Serialize(obj).Trim().Should()
895+
.Be(@"{""TestProperty"": ""Sea life \uD83E\uDD9E""}");
896+
}
897+
889898
[Fact]
890899
// Todo: this is actually roundtrip
891900
public void DeserializationOfDefaultsWorkInJson()

YamlDotNet/Core/Emitter.cs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ public class Emitter : IEmitter
6666
private bool isWhitespace;
6767
private bool isIndentation;
6868
private readonly bool forceIndentLess;
69+
private readonly bool useUtf16SurrogatePair;
6970
private readonly string newLine;
7071

7172
private bool isDocumentEndWritten;
@@ -148,6 +149,7 @@ public Emitter(TextWriter output, EmitterSettings settings)
148149
this.maxSimpleKeyLength = settings.MaxSimpleKeyLength;
149150
this.skipAnchorName = settings.SkipAnchorName;
150151
this.forceIndentLess = !settings.IndentSequences;
152+
this.useUtf16SurrogatePair = settings.UseUtf16SurrogatePairs;
151153
this.newLine = settings.NewLine;
152154

153155
this.output = output;
@@ -1189,8 +1191,20 @@ private void WriteDoubleQuotedScalar(string value, bool allowBreaks)
11891191
{
11901192
if (index + 1 < value.Length && IsLowSurrogate(value[index + 1]))
11911193
{
1192-
Write('U');
1193-
Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
1194+
if (useUtf16SurrogatePair)
1195+
{
1196+
Write('u');
1197+
Write(code.ToString("X04", CultureInfo.InvariantCulture));
1198+
Write('\\');
1199+
Write('u');
1200+
Write(((ushort)value[index + 1]).ToString("X04", CultureInfo.InvariantCulture));
1201+
}
1202+
else
1203+
{
1204+
Write('U');
1205+
Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
1206+
}
1207+
11941208
index++;
11951209
}
11961210
else

YamlDotNet/Core/EmitterSettings.cs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,22 @@ public sealed class EmitterSettings
6363
/// </summary>
6464
public bool IndentSequences { get; }
6565

66+
/// <summary>
67+
/// If true, then 4-byte UTF-32 characters are broken into two 2-byte UTF-16 code units (a surrogate pair).
68+
/// </summary>
69+
/// <remarks>
70+
/// This ensures compatibility with JSON format, as it does not allow '\Uxxxxxxxx'
71+
/// and instead expects two escaped 2-byte characters '\uxxxx\uxxxx'.
72+
/// </remarks>
73+
public bool UseUtf16SurrogatePairs { get; }
74+
6675
public static readonly EmitterSettings Default = new EmitterSettings();
6776

6877
public EmitterSettings()
6978
{
7079
}
7180

72-
public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, string? newLine = null)
81+
public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, bool useUtf16SurrogatePairs = false, string? newLine = null)
7382
{
7483
if (bestIndent < 2 || bestIndent > 9)
7584
{
@@ -92,6 +101,7 @@ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxS
92101
MaxSimpleKeyLength = maxSimpleKeyLength;
93102
SkipAnchorName = skipAnchorName;
94103
IndentSequences = indentSequences;
104+
UseUtf16SurrogatePairs = useUtf16SurrogatePairs;
95105
NewLine = newLine ?? Environment.NewLine;
96106
}
97107

@@ -104,6 +114,7 @@ public EmitterSettings WithBestIndent(int bestIndent)
104114
MaxSimpleKeyLength,
105115
SkipAnchorName,
106116
IndentSequences,
117+
UseUtf16SurrogatePairs,
107118
NewLine
108119
);
109120
}
@@ -117,6 +128,7 @@ public EmitterSettings WithBestWidth(int bestWidth)
117128
MaxSimpleKeyLength,
118129
SkipAnchorName,
119130
IndentSequences,
131+
UseUtf16SurrogatePairs,
120132
NewLine
121133
);
122134
}
@@ -130,6 +142,7 @@ public EmitterSettings WithMaxSimpleKeyLength(int maxSimpleKeyLength)
130142
maxSimpleKeyLength,
131143
SkipAnchorName,
132144
IndentSequences,
145+
UseUtf16SurrogatePairs,
133146
NewLine
134147
);
135148
}
@@ -143,6 +156,7 @@ public EmitterSettings WithNewLine(string newLine)
143156
MaxSimpleKeyLength,
144157
SkipAnchorName,
145158
IndentSequences,
159+
UseUtf16SurrogatePairs,
146160
newLine
147161
);
148162
}
@@ -167,6 +181,7 @@ public EmitterSettings WithoutAnchorName()
167181
MaxSimpleKeyLength,
168182
true,
169183
IndentSequences,
184+
UseUtf16SurrogatePairs,
170185
NewLine
171186
);
172187
}
@@ -180,6 +195,21 @@ public EmitterSettings WithIndentedSequences()
180195
MaxSimpleKeyLength,
181196
SkipAnchorName,
182197
true,
198+
UseUtf16SurrogatePairs,
199+
NewLine
200+
);
201+
}
202+
203+
public EmitterSettings WithUtf16SurrogatePairs()
204+
{
205+
return new EmitterSettings(
206+
BestIndent,
207+
BestWidth,
208+
IsCanonical,
209+
MaxSimpleKeyLength,
210+
SkipAnchorName,
211+
IndentSequences,
212+
true,
183213
NewLine
184214
);
185215
}

YamlDotNet/Serialization/SerializerBuilder.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,8 @@ public SerializerBuilder JsonCompatible()
366366
{
367367
this.emitterSettings = this.emitterSettings
368368
.WithMaxSimpleKeyLength(int.MaxValue)
369-
.WithoutAnchorName();
369+
.WithoutAnchorName()
370+
.WithUtf16SurrogatePairs();
370371

371372
return this
372373
.WithTypeConverter(new GuidConverter(true), w => w.InsteadOf<GuidConverter>())

0 commit comments

Comments
 (0)