Skip to content

Commit 508506e

Browse files
committed
Fix TextEncoderStream surrogate pair handling across chunks
1 parent f01bb10 commit 508506e

File tree

2 files changed

+116
-28
lines changed

2 files changed

+116
-28
lines changed

src/workerd/api/streams/encoding.c++

Lines changed: 115 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,35 +4,136 @@
44

55
#include "encoding.h"
66

7+
#include "simdutf.h"
8+
79
#include <workerd/api/encoding.h>
810
#include <workerd/api/streams/standard.h>
911
#include <workerd/io/features.h>
1012
#include <workerd/jsg/jsg.h>
1113

14+
#include <v8.h>
15+
16+
#include <kj/common.h>
17+
#include <kj/refcount.h>
18+
1219
namespace workerd::api {
1320

21+
namespace {
22+
constexpr char16_t REPLACEMENT_CHAR = 0xFFFD;
23+
constexpr kj::byte REPLACEMENT_UTF8[] = {0xEF, 0xBF, 0xBD};
24+
25+
struct Holder: public kj::Refcounted {
26+
kj::Maybe<char16_t> pending = kj::none;
27+
};
28+
} // namespace
29+
30+
// TextEncoderStream encodes a stream of JavaScript strings into UTF-8 bytes.
31+
//
32+
// WHATWG Encoding spec requirement (https://encoding.spec.whatwg.org/#interface-textencoderstream):
33+
// The encoder must handle surrogate pairs that may be split across chunk boundaries.
34+
// This is tested by WPT's "encoding/streams/encode-utf8.any.js" which includes:
35+
// - "a character split between chunks should be correctly encoded" test
36+
// - Input: ["\uD83D", "\uDC99"] (U+1F499 💙 split into high/low surrogate chunks)
37+
// - Expected output: [0xf0, 0x9f, 0x92, 0x99] (U+1F499 encoded as UTF-8)
38+
//
39+
// The main complexity is handling UTF-16 surrogate pairs that may be split across chunks:
40+
// - JavaScript strings use UTF-16 encoding internally
41+
// - A surrogate pair consists of a high surrogate (0xD800-0xDBFF) followed by a low surrogate
42+
// (0xDC00-0xDFFF), representing code points above U+FFFF (e.g., emoji, rare CJK characters)
43+
// - If a chunk ends with a high surrogate, we must wait for the next chunk to see if it starts
44+
// with a matching low surrogate before encoding
45+
// - If no match arrives (chunk starts with non-low-surrogate, or stream ends), the orphaned
46+
// high surrogate is replaced with U+FFFD (replacement character)
47+
//
48+
// State machine:
49+
// holder->pending = kj::none -> No pending high surrogate from previous chunk
50+
// holder->pending = char16_t -> High surrogate waiting for a matching low surrogate
51+
//
52+
// Transform algorithm for each chunk:
53+
// 1. Allocate buffer with prefix slot if we have a pending surrogate
54+
// 2. Write the chunk's UTF-16 code units into the buffer (after the prefix slot)
55+
// 3. If pending exists:
56+
// - If chunk starts with low surrogate -> complete the pair (buf[0] = pending lead)
57+
// - Otherwise -> replace pending with U+FFFD (buf[0] = REPLACEMENT_CHAR)
58+
// 4. If chunk ends with high surrogate -> save it as pending, exclude from output
59+
// 5. Sanitize remaining surrogates with simdutf::to_well_formed_utf16
60+
// 6. Convert to UTF-8 and enqueue
61+
//
62+
// Flush algorithm (when stream closes):
63+
// - If pending high surrogate exists -> emit U+FFFD (3 UTF-8 bytes: 0xEF 0xBF 0xBD)
64+
//
65+
// Ref: https://github.com/web-platform-tests/wpt/blob/master/encoding/streams/encode-utf8.any.js
1466
jsg::Ref<TextEncoderStream> TextEncoderStream::constructor(jsg::Lock& js) {
67+
auto state = kj::rc<Holder>();
68+
1569
auto transformer = TransformStream::constructor(js,
1670
Transformer{.transform = jsg::Function<Transformer::TransformAlgorithm>(
17-
[](jsg::Lock& js, auto chunk, auto controller) {
71+
[holder = state.addRef()](jsg::Lock& js, v8::Local<v8::Value> chunk,
72+
jsg::Ref<TransformStreamDefaultController> controller) mutable {
1873
auto str = jsg::check(chunk->ToString(js.v8Context()));
19-
auto utf8Length = str->Utf8LengthV2(js.v8Isolate);
20-
21-
// Don't emit empty chunks
22-
if (utf8Length == 0) {
23-
return js.resolvedPromise();
74+
size_t length = str->Length();
75+
76+
// Early exit: empty chunk with no pending surrogate produces no output
77+
if (length == 0 && holder->pending == kj::none) return js.resolvedPromise();
78+
79+
// Allocate buffer: reserve slot 0 for pending surrogate if we have one
80+
size_t prefix = (holder->pending != kj::none) ? 1 : 0;
81+
auto buf = kj::heapArray<char16_t>(prefix + length);
82+
str->WriteV2(js.v8Isolate, 0, length, reinterpret_cast<uint16_t*>(buf.begin() + prefix));
83+
84+
// Handle pending high surrogate from previous chunk
85+
KJ_IF_SOME(lead, holder->pending) {
86+
KJ_DASSERT(U_IS_LEAD(lead), "pending must be a high surrogate");
87+
// Empty chunk: keep pending surrogate for next chunk
88+
if (length == 0) return js.resolvedPromise();
89+
holder->pending = kj::none;
90+
// If chunk starts with matching low surrogate, complete the pair; otherwise emit U+FFFD
91+
buf[0] = U_IS_TRAIL(buf[prefix]) ? lead : REPLACEMENT_CHAR;
2492
}
2593

26-
v8::Local<v8::ArrayBuffer> buffer;
27-
JSG_REQUIRE(v8::ArrayBuffer::MaybeNew(js.v8Isolate, utf8Length).ToLocal(&buffer), RangeError,
28-
"Cannot allocate space for TextEncoder.encode");
94+
size_t end = prefix + length;
95+
KJ_DASSERT(end <= buf.size());
2996

30-
auto bytes = jsg::asBytes(buffer).releaseAsChars();
31-
[[maybe_unused]] auto written = str->WriteUtf8V2(
32-
js.v8Isolate, bytes.begin(), bytes.size(), v8::String::WriteFlags::kReplaceInvalidUtf8);
97+
// If chunk ends with high surrogate, save it for next chunk
98+
if (end > 0 && U_IS_LEAD(buf[end - 1])) {
99+
holder->pending = buf[--end];
100+
}
33101

34-
KJ_DASSERT(written == buffer->ByteLength());
35-
controller->enqueue(js, v8::Uint8Array::New(buffer, 0, buffer->ByteLength()));
102+
// Nothing to encode after handling surrogates
103+
if (end == 0) return js.resolvedPromise();
104+
105+
auto slice = buf.first(end);
106+
KJ_DASSERT(slice.size() > 0);
107+
auto result = simdutf::utf8_length_from_utf16_with_replacement(slice.begin(), slice.size());
108+
// Only sanitize if there are unpaired surrogates in the middle of the buffer
109+
if (result.error == simdutf::error_code::SURROGATE) {
110+
simdutf::to_well_formed_utf16(slice.begin(), slice.size(), slice.begin());
111+
}
112+
auto utf8Length = result.count;
113+
KJ_DASSERT(utf8Length > 0);
114+
115+
auto backingStore = js.allocBackingStore(utf8Length, jsg::Lock::AllocOption::UNINITIALIZED);
116+
auto dest = kj::ArrayPtr<char>(static_cast<char*>(backingStore->Data()), utf8Length);
117+
[[maybe_unused]] auto written =
118+
simdutf::convert_utf16_to_utf8(slice.begin(), slice.size(), dest.begin());
119+
KJ_DASSERT(written == utf8Length, "simdutf should write exactly utf8Length bytes");
120+
121+
auto array = v8::Uint8Array::New(
122+
v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8Length);
123+
controller->enqueue(js, jsg::JsUint8Array(array));
124+
return js.resolvedPromise();
125+
}),
126+
.flush = jsg::Function<Transformer::FlushAlgorithm>(
127+
[holder = state.addRef()](
128+
jsg::Lock& js, jsg::Ref<TransformStreamDefaultController> controller) mutable {
129+
// If stream ends with orphaned high surrogate, emit replacement character
130+
if (holder->pending != kj::none) {
131+
auto backingStore = js.allocBackingStore(3, jsg::Lock::AllocOption::UNINITIALIZED);
132+
memcpy(backingStore->Data(), REPLACEMENT_UTF8, 3);
133+
auto array =
134+
v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, 3);
135+
controller->enqueue(js, jsg::JsUint8Array(array));
136+
}
36137
return js.resolvedPromise();
37138
})},
38139
StreamQueuingStrategy{}, StreamQueuingStrategy{});

src/wpt/encoding-test.ts

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -110,20 +110,7 @@ export default {
110110
],
111111
},
112112
'streams/encode-bad-chunks.any.js': {},
113-
'streams/encode-utf8.any.js': {
114-
comment: 'Surrogate pair handling across chunks not yet implemented',
115-
expectedFailures: [
116-
'a character split between chunks should be correctly encoded',
117-
'a character following one split between chunks should be correctly encoded',
118-
'an unmatched surrogate at the end of a chunk followed by an astral character in the next chunk should be replaced with the replacement character at the start of the next output chunk',
119-
'an unmatched surrogate at the end of a chunk followed by an ascii character in the next chunk should be replaced with the replacement character at the start of the next output chunk',
120-
'a non-terminal unpaired leading surrogate should immediately be replaced',
121-
'two consecutive astral characters each split down the middle should be correctly reassembled',
122-
'two consecutive astral characters each split down the middle with an invalid surrogate in the middle should be correctly encoded',
123-
'an unmatched surrogate at the end of a chunk followed by a plane 1 character split into two chunks should result in the encoded plane 1 character appearing in the last output chunk',
124-
'a leading surrogate chunk should be carried past empty chunks',
125-
],
126-
},
113+
'streams/encode-utf8.any.js': {},
127114
'streams/invalid-realm.window.js': {
128115
comment: 'Enable when ShadowRealm is supported',
129116
expectedFailures: [

0 commit comments

Comments
 (0)