Skip to content

Commit e0c94f8

Browse files
Improve XmlDictionaryWriter UTF8 encoding performance (#73336)
* Speed up text encoding * Update implementation * Add tests for binary xml strings * limit counting code to 256 bit vectors * reword comment * rename test * move bytesmax * Fix bytesMax after moving variable initialization * use unicode escape value in test * fix test typo "*" -> "+" * Update src/libraries/System.Private.DataContractSerialization/src/System/Xml/XmlStreamNodeWriter.cs Co-authored-by: Stephen Toub <stoub@microsoft.com> * Remove vectorized code from UnsafeGetUTF8Length * Fix overflow * use for loop which seems faster * remove vector loop * make sealed encoding to allow devirtualisation * back some changes * use uint for UnsafeGetUTF8Chars comparison * revert more changes * Fix cutoff based on new measurements * use BinaryPrimitives.ReverseEndianness as suggested * Update cutoff from 24 to 32 chars before calling, due to regression for text based DataContractSerializer * Remove sealed encoding since it only improves XmlConvert --------- Co-authored-by: Stephen Toub <stoub@microsoft.com>
1 parent b54d6ef commit e0c94f8

File tree

2 files changed

+93
-41
lines changed

2 files changed

+93
-41
lines changed

src/libraries/System.Private.DataContractSerialization/src/System/Xml/XmlStreamNodeWriter.cs

Lines changed: 28 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4+
using System.Buffers.Binary;
45
using System.IO;
56
using System.Text;
7+
using System.Runtime.InteropServices;
68
using System.Runtime.Serialization;
79
using System.Threading.Tasks;
810
using System.Diagnostics;
@@ -330,34 +332,26 @@ protected unsafe void UnsafeWriteUnicodeChars(char* chars, int charCount)
330332
}
331333
}
332334

333-
protected unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
335+
protected static unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
334336
{
335-
char* charsMax = chars + charCount;
336-
while (chars < charsMax)
337+
if (BitConverter.IsLittleEndian)
337338
{
338-
char value = *chars++;
339-
buffer[offset++] = (byte)value;
340-
value >>= 8;
341-
buffer[offset++] = (byte)value;
339+
new ReadOnlySpan<char>(chars, charCount)
340+
.CopyTo(MemoryMarshal.Cast<byte, char>(buffer.AsSpan(offset)));
342341
}
342+
else
343+
{
344+
BinaryPrimitives.ReverseEndianness(new ReadOnlySpan<short>(chars, charCount),
345+
MemoryMarshal.Cast<byte, short>(buffer.AsSpan(offset)));
346+
}
347+
343348
return charCount * 2;
344349
}
345350

346351
protected unsafe int UnsafeGetUTF8Length(char* chars, int charCount)
347352
{
348-
char* charsMax = chars + charCount;
349-
while (chars < charsMax)
350-
{
351-
if (*chars >= 0x80)
352-
break;
353-
354-
chars++;
355-
}
356-
357-
if (chars == charsMax)
358-
return charCount;
359-
360-
return (int)(chars - (charsMax - charCount)) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, (int)(charsMax - chars));
353+
// Length will always be at least ( 128 / maxBytesPerChar) = 42
354+
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, charCount);
361355
}
362356

363357
protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffer, int offset)
@@ -366,39 +360,32 @@ protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffe
366360
{
367361
fixed (byte* _bytes = &buffer[offset])
368362
{
369-
byte* bytes = _bytes;
370-
byte* bytesMax = &bytes[buffer.Length - offset];
371-
char* charsMax = &chars[charCount];
372-
373-
while (true)
363+
// Fast path for small strings, use Encoding.GetBytes for larger strings since it is faster when vectorization is possible
364+
if ((uint)charCount < 32)
374365
{
366+
byte* bytes = _bytes;
367+
char* charsMax = &chars[charCount];
368+
375369
while (chars < charsMax)
376370
{
377371
char t = *chars;
378372
if (t >= 0x80)
379-
break;
373+
goto NonAscii;
380374

381375
*bytes = (byte)t;
382376
bytes++;
383377
chars++;
384378
}
379+
return charCount;
385380

386-
if (chars >= charsMax)
387-
break;
388-
389-
char* charsStart = chars;
390-
while (chars < charsMax && *chars >= 0x80)
391-
{
392-
chars++;
393-
}
394-
395-
bytes += (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(charsStart, (int)(chars - charsStart), bytes, (int)(bytesMax - bytes));
396-
397-
if (chars >= charsMax)
398-
break;
381+
NonAscii:
382+
byte* bytesMax = _bytes + buffer.Length - offset;
383+
return (int)(bytes - _bytes) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, (int)(charsMax - chars), bytes, (int)(bytesMax - bytes));
384+
}
385+
else
386+
{
387+
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, charCount, _bytes, buffer.Length - offset);
399388
}
400-
401-
return (int)(bytes - _bytes);
402389
}
403390
}
404391
return 0;

src/libraries/System.Runtime.Serialization.Xml/tests/XmlDictionaryWriterTest.cs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,71 @@ void AssertBytesWritten(Action<XmlDictionaryWriter> action, XmlBinaryNodeType no
494494
}
495495
}
496496

497+
[Fact]
498+
public static void XmlBaseWriter_WriteString()
499+
{
500+
const byte Chars8Text = 152;
501+
const byte Chars16Text = 154;
502+
MemoryStream ms = new MemoryStream();
503+
XmlDictionaryWriter writer = (XmlDictionaryWriter)XmlDictionaryWriter.CreateBinaryWriter(ms);
504+
writer.WriteStartElement("root");
505+
506+
int[] lengths = new[] { 7, 8, 9, 15, 16, 17, 31, 32, 36, 258 };
507+
byte[] buffer = new byte[lengths.Max() + 1];
508+
509+
foreach (var length in lengths)
510+
{
511+
string allAscii = string.Create(length, null, (Span<char> chars, object _) =>
512+
{
513+
for (int i = 0; i < chars.Length; ++i)
514+
chars[i] = (char)(i % 128);
515+
});
516+
string multiByteLast = string.Create(length, null, (Span<char> chars, object _) =>
517+
{
518+
for (int i = 0; i < chars.Length; ++i)
519+
chars[i] = (char)(i % 128);
520+
chars[^1] = '\u00E4'; // 'ä' - Latin Small Letter a with Diaeresis. Latin-1 Supplement.
521+
});
522+
523+
int numBytes = Encoding.UTF8.GetBytes(allAscii, buffer);
524+
Assert.True(numBytes == length, "Test setup wrong - allAscii");
525+
ValidateWriteText(ms, writer, allAscii, expected: buffer.AsSpan(0, numBytes));
526+
527+
numBytes = Encoding.UTF8.GetBytes(multiByteLast, buffer);
528+
Assert.True(numBytes == length + 1, "Test setup wrong - multiByte");
529+
ValidateWriteText(ms, writer, multiByteLast, expected: buffer.AsSpan(0, numBytes));
530+
}
531+
532+
static void ValidateWriteText(MemoryStream ms, XmlDictionaryWriter writer, string text, ReadOnlySpan<byte> expected)
533+
{
534+
writer.Flush();
535+
ms.Seek(0, SeekOrigin.Begin);
536+
ms.SetLength(0);
537+
writer.WriteString(text);
538+
writer.Flush();
539+
540+
ms.TryGetBuffer(out ArraySegment<byte> arraySegment);
541+
ReadOnlySpan<byte> buffer = arraySegment;
542+
543+
if (expected.Length <= byte.MaxValue)
544+
{
545+
Assert.Equal(Chars8Text, buffer[0]);
546+
Assert.Equal(expected.Length, buffer[1]);
547+
buffer = buffer.Slice(2);
548+
}
549+
else if (expected.Length <= ushort.MaxValue)
550+
{
551+
Assert.Equal(Chars16Text, buffer[0]);
552+
Assert.Equal(expected.Length, (int)(buffer[1]) | ((int)buffer[2] << 8));
553+
buffer = buffer.Slice(3);
554+
}
555+
else
556+
Assert.Fail("test use to long length");
557+
558+
AssertExtensions.SequenceEqual(expected, buffer);
559+
}
560+
}
561+
497562
private static bool ReadTest(MemoryStream ms, Encoding encoding, ReaderWriterFactory.ReaderWriterType rwType, byte[] byteArray)
498563
{
499564
ms.Position = 0;

0 commit comments

Comments
 (0)