Skip to content

Commit 59cc2d9

Browse files
authored
Merge pull request #5702 from cloudflare/erikcorry/simplify
2 parents 99423c5 + 159dfdb commit 59cc2d9

File tree

1 file changed

+12
-47
lines changed

1 file changed

+12
-47
lines changed

src/workerd/api/streams/encoding.c++

Lines changed: 12 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
namespace workerd::api {
2020

2121
namespace {
22-
constexpr char16_t REPLACEMENT_CHAR = 0xFFFD;
2322
constexpr kj::byte REPLACEMENT_UTF8[] = {0xEF, 0xBF, 0xBD};
2423

2524
struct Holder: public kj::Refcounted {
@@ -30,38 +29,15 @@ struct Holder: public kj::Refcounted {
3029
// TextEncoderStream encodes a stream of JavaScript strings into UTF-8 bytes.
3130
//
3231
// WHATWG Encoding spec requirement (https://encoding.spec.whatwg.org/#interface-textencoderstream):
33-
// The encoder must handle surrogate pairs that may be split across chunk boundaries.
34-
// This is tested by WPT's "encoding/streams/encode-utf8.any.js" which includes:
35-
// - "a character split between chunks should be correctly encoded" test
36-
// - Input: ["\uD83D", "\uDC99"] (U+1F499 💙 split into high/low surrogate chunks)
37-
// - Expected output: [0xf0, 0x9f, 0x92, 0x99] (U+1F499 encoded as UTF-8)
32+
// The encoder must encode each unpaired UTF-16 surrogate as the replacement character U+FFFD.
3833
//
39-
// The main complexity is handling UTF-16 surrogate pairs that may be split across chunks:
40-
// - JavaScript strings use UTF-16 encoding internally
41-
// - A surrogate pair consists of a high surrogate (0xD800-0xDBFF) followed by a low surrogate
42-
// (0xDC00-0xDFFF), representing code points above U+FFFF (e.g., emoji, rare CJK characters)
43-
// - If a chunk ends with a high surrogate, we must wait for the next chunk to see if it starts
44-
// with a matching low surrogate before encoding
45-
// - If no match arrives (chunk starts with non-low-surrogate, or stream ends), the orphaned
46-
// high surrogate is replaced with U+FFFD (replacement character)
34+
// simdutf performs that replacement for us, but it must never be handed only half of a
35+
// surrogate pair (high surrogate followed by low surrogate) split across chunk boundaries.
4736
//
48-
// State machine:
37+
// We track a high surrogate held back from the end of a chunk with the pending field:
4938
// holder->pending = kj::none -> No pending high surrogate from previous chunk
5039
// holder->pending = char16_t -> High surrogate waiting for a matching low surrogate
5140
//
52-
// Transform algorithm for each chunk:
53-
// 1. Allocate buffer with prefix slot if we have a pending surrogate
54-
// 2. Write the chunk's UTF-16 code units into the buffer (after the prefix slot)
55-
// 3. If pending exists:
56-
// - If chunk starts with low surrogate -> complete the pair (buf[0] = pending lead)
57-
// - Otherwise -> replace pending with U+FFFD (buf[0] = REPLACEMENT_CHAR)
58-
// 4. If chunk ends with high surrogate -> save it as pending, exclude from output
59-
// 5. Sanitize remaining surrogates with simdutf::to_well_formed_utf16
60-
// 6. Convert to UTF-8 and enqueue
61-
//
62-
// Flush algorithm (when stream closes):
63-
// - If pending high surrogate exists -> emit U+FFFD (3 UTF-8 bytes: 0xEF 0xBF 0xBD)
64-
//
6541
// Ref: https://github.com/web-platform-tests/wpt/blob/master/encoding/streams/encode-utf8.any.js
6642
jsg::Ref<TextEncoderStream> TextEncoderStream::constructor(jsg::Lock& js) {
6743
auto state = kj::rc<Holder>();
@@ -70,45 +46,34 @@ jsg::Ref<TextEncoderStream> TextEncoderStream::constructor(jsg::Lock& js) {
7046
jsg::Ref<TransformStreamDefaultController> controller) mutable {
7147
auto str = jsg::check(chunk->ToString(js.v8Context()));
7248
size_t length = str->Length();
73-
74-
// Early exit: empty chunk with no pending surrogate produces no output
75-
if (length == 0 && holder->pending == kj::none) return js.resolvedPromise();
49+
if (length == 0) return js.resolvedPromise();
7650

7751
// Allocate buffer: reserve slot 0 for pending surrogate if we have one
78-
size_t prefix = (holder->pending != kj::none) ? 1 : 0;
79-
auto buf = kj::heapArray<char16_t>(prefix + length);
52+
size_t prefix = (holder->pending == kj::none) ? 0 : 1;
53+
size_t end = prefix + length;
54+
auto buf = kj::heapArray<char16_t>(end);
8055
str->WriteV2(js.v8Isolate, 0, length, reinterpret_cast<uint16_t*>(buf.begin() + prefix));
8156

82-
// Handle pending high surrogate from previous chunk
8357
KJ_IF_SOME(lead, holder->pending) {
84-
KJ_DASSERT(U_IS_LEAD(lead), "pending must be a high surrogate");
85-
// Empty chunk: keep pending surrogate for next chunk
86-
if (length == 0) return js.resolvedPromise();
58+
buf.begin()[0] = lead;
8759
holder->pending = kj::none;
88-
// If chunk starts with matching low surrogate, complete the pair; otherwise emit U+FFFD
89-
buf[0] = U_IS_TRAIL(buf[prefix]) ? lead : REPLACEMENT_CHAR;
9060
}
9161

92-
size_t end = prefix + length;
93-
KJ_DASSERT(end <= buf.size());
94-
9562
// If chunk ends with high surrogate, save it for next chunk
9663
if (end > 0 && U_IS_LEAD(buf[end - 1])) {
9764
holder->pending = buf[--end];
9865
}
99-
100-
// Nothing to encode after handling surrogates
10166
if (end == 0) return js.resolvedPromise();
10267

10368
auto slice = buf.first(end);
104-
KJ_DASSERT(slice.size() > 0);
10569
auto result = simdutf::utf8_length_from_utf16_with_replacement(slice.begin(), slice.size());
106-
// Only sanitize if there are unpaired surrogates in the middle of the buffer
70+
// Only sanitize if there are surrogates in the buffer - UTF-16 without
71+
// surrogates is always well-formed.
10772
if (result.error == simdutf::error_code::SURROGATE) {
10873
simdutf::to_well_formed_utf16(slice.begin(), slice.size(), slice.begin());
10974
}
11075
auto utf8Length = result.count;
111-
KJ_DASSERT(utf8Length > 0);
76+
KJ_DASSERT(utf8Length > 0 && utf8Length >= end);
11277

11378
auto backingStore = js.allocBackingStore(utf8Length, jsg::Lock::AllocOption::UNINITIALIZED);
11479
auto dest = kj::ArrayPtr<char>(static_cast<char*>(backingStore->Data()), utf8Length);

0 commit comments

Comments
 (0)