1919namespace workerd ::api {
2020
2121namespace {
22- constexpr char16_t REPLACEMENT_CHAR = 0xFFFD ;
// UTF-8 byte sequence for U+FFFD (the Unicode replacement character).
2322constexpr kj::byte REPLACEMENT_UTF8[] = {0xEF , 0xBF , 0xBD };
2423
2524struct Holder : public kj ::Refcounted {
@@ -30,38 +29,15 @@ struct Holder: public kj::Refcounted {
3029// TextEncoderStream encodes a stream of JavaScript strings into UTF-8 bytes.
3130//
3231// WHATWG Encoding spec requirement (https://encoding.spec.whatwg.org/#interface-textencoderstream):
33- // The encoder must handle surrogate pairs that may be split across chunk boundaries.
34- // This is tested by WPT's "encoding/streams/encode-utf8.any.js" which includes:
35- // - "a character split between chunks should be correctly encoded" test
36- // - Input: ["\uD83D", "\uDC99"] (U+1F499 💙 split into high/low surrogate chunks)
37- // - Expected output: [0xf0, 0x9f, 0x92, 0x99] (U+1F499 encoded as UTF-8)
32+ // The encoder must encode unpaired UTF-16 surrogates as replacement characters.
3833//
39- // The main complexity is handling UTF-16 surrogate pairs that may be split across chunks:
40- // - JavaScript strings use UTF-16 encoding internally
41- // - A surrogate pair consists of a high surrogate (0xD800-0xDBFF) followed by a low surrogate
42- // (0xDC00-0xDFFF), representing code points above U+FFFF (e.g., emoji, rare CJK characters)
43- // - If a chunk ends with a high surrogate, we must wait for the next chunk to see if it starts
44- // with a matching low surrogate before encoding
45- // - If no match arrives (chunk starts with non-low-surrogate, or stream ends), the orphaned
46- // high surrogate is replaced with U+FFFD (replacement character)
34+ // simdutf replaces unpaired surrogates for us, but a surrogate pair (high surrogate
35+ // followed by low surrogate) may be split across chunk boundaries and needs care.
4736//
48- // State machine :
37+ // We track a pair split across chunks with the pending field:
4938// holder->pending = kj::none -> No pending high surrogate from previous chunk
5039// holder->pending = char16_t -> High surrogate waiting for a matching low surrogate
5140//
52- // Transform algorithm for each chunk:
53- // 1. Allocate buffer with prefix slot if we have a pending surrogate
54- // 2. Write the chunk's UTF-16 code units into the buffer (after the prefix slot)
55- // 3. If pending exists:
56- // - If chunk starts with low surrogate -> complete the pair (buf[0] = pending lead)
57- // - Otherwise -> replace pending with U+FFFD (buf[0] = REPLACEMENT_CHAR)
58- // 4. If chunk ends with high surrogate -> save it as pending, exclude from output
59- // 5. Sanitize remaining surrogates with simdutf::to_well_formed_utf16
60- // 6. Convert to UTF-8 and enqueue
61- //
62- // Flush algorithm (when stream closes):
63- // - If pending high surrogate exists -> emit U+FFFD (3 UTF-8 bytes: 0xEF 0xBF 0xBD)
64- //
6541// Ref: https://github.com/web-platform-tests/wpt/blob/master/encoding/streams/encode-utf8.any.js
6642jsg::Ref<TextEncoderStream> TextEncoderStream::constructor (jsg::Lock& js) {
6743 auto state = kj::rc<Holder>();
@@ -70,45 +46,34 @@ jsg::Ref<TextEncoderStream> TextEncoderStream::constructor(jsg::Lock& js) {
7046 jsg::Ref<TransformStreamDefaultController> controller) mutable {
7147 auto str = jsg::check (chunk->ToString (js.v8Context ()));
7248 size_t length = str->Length ();
73-
74- // Early exit: empty chunk with no pending surrogate produces no output
75- if (length == 0 && holder->pending == kj::none) return js.resolvedPromise ();
49+ if (length == 0 ) return js.resolvedPromise ();
7650
7751 // Allocate buffer: reserve slot 0 for pending surrogate if we have one
78- size_t prefix = (holder->pending != kj::none) ? 1 : 0 ;
79- auto buf = kj::heapArray<char16_t >(prefix + length);
52+ size_t prefix = (holder->pending == kj::none) ? 0 : 1 ;
53+ size_t end = prefix + length;
54+ auto buf = kj::heapArray<char16_t >(end);
8055 str->WriteV2 (js.v8Isolate , 0 , length, reinterpret_cast <uint16_t *>(buf.begin () + prefix));
8156
82- // Handle pending high surrogate from previous chunk
8357 KJ_IF_SOME (lead, holder->pending ) {
84- KJ_DASSERT (U_IS_LEAD (lead), " pending must be a high surrogate" );
85- // Empty chunk: keep pending surrogate for next chunk
86- if (length == 0 ) return js.resolvedPromise ();
58+ buf.begin ()[0 ] = lead;
8759 holder->pending = kj::none;
88- // If chunk starts with matching low surrogate, complete the pair; otherwise emit U+FFFD
89- buf[0 ] = U_IS_TRAIL (buf[prefix]) ? lead : REPLACEMENT_CHAR;
9060 }
9161
92- size_t end = prefix + length;
93- KJ_DASSERT (end <= buf.size ());
94-
9562 // If the chunk ends with a high surrogate, hold it back and save it for the next chunk
9663 if (end > 0 && U_IS_LEAD (buf[end - 1 ])) {
9764 holder->pending = buf[--end];
9865 }
99-
100- // Nothing to encode after handling surrogates
10166 if (end == 0 ) return js.resolvedPromise ();
10267
10368 auto slice = buf.first (end);
104- KJ_DASSERT (slice.size () > 0 );
10569 auto result = simdutf::utf8_length_from_utf16_with_replacement (slice.begin (), slice.size ());
106- // Only sanitize if there are unpaired surrogates in the middle of the buffer
70+ // Only sanitize if there are surrogates in the buffer - UTF-16 without
71+ // surrogates is always well-formed.
10772 if (result.error == simdutf::error_code::SURROGATE) {
10873 simdutf::to_well_formed_utf16 (slice.begin (), slice.size (), slice.begin ());
10974 }
11075 auto utf8Length = result.count ;
111- KJ_DASSERT (utf8Length > 0 );
76+ KJ_DASSERT (utf8Length > 0 && utf8Length >= end );
11277
11378 auto backingStore = js.allocBackingStore (utf8Length, jsg::Lock::AllocOption::UNINITIALIZED);
11479 auto dest = kj::ArrayPtr<char >(static_cast <char *>(backingStore->Data ()), utf8Length);
0 commit comments