Fix TextEncoderStream surrogate pair handling across chunks

anonrig · anonrig · commit 508506e66965 · 2025-12-12T17:56:59.000-05:00
diff --git a/src/workerd/api/streams/encoding.c++ b/src/workerd/api/streams/encoding.c++
@@ -4,35 +4,136 @@
 
 #include "encoding.h"
 
+#include "simdutf.h"
+
 #include <workerd/api/encoding.h>
 #include <workerd/api/streams/standard.h>
 #include <workerd/io/features.h>
 #include <workerd/jsg/jsg.h>
 
+#include <v8.h>
+
+#include <kj/common.h>
+#include <kj/refcount.h>
+
 namespace workerd::api {
 
+namespace {
+constexpr char16_t REPLACEMENT_CHAR = 0xFFFD;  // U+FFFD as a single UTF-16 code unit
+constexpr kj::byte REPLACEMENT_UTF8[] = {0xEF, 0xBF, 0xBD};  // UTF-8 encoding of U+FFFD
+
+struct Holder: public kj::Refcounted {  // state shared by the transform and flush callbacks
+  kj::Maybe<char16_t> pending = kj::none;  // held-back high surrogate, if any
+};
+}  // namespace
+
+// TextEncoderStream encodes a stream of JavaScript strings into UTF-8 bytes.
+//
+// WHATWG Encoding spec requirement (https://encoding.spec.whatwg.org/#interface-textencoderstream):
+// The encoder must handle surrogate pairs that may be split across chunk boundaries.
+// This is tested by WPT's "encoding/streams/encode-utf8.any.js" which includes:
+//   - "a character split between chunks should be correctly encoded" test
+//   - Input: ["\uD83D", "\uDC99"] (U+1F499 💙 split into high/low surrogate chunks)
+//   - Expected output: [0xf0, 0x9f, 0x92, 0x99] (U+1F499 encoded as UTF-8)
+//
+// The main complexity is handling UTF-16 surrogate pairs that may be split across chunks:
+// - JavaScript strings use UTF-16 encoding internally
+// - A surrogate pair consists of a high surrogate (0xD800-0xDBFF) followed by a low surrogate
+//   (0xDC00-0xDFFF), representing code points above U+FFFF (e.g., emoji, rare CJK characters)
+// - If a chunk ends with a high surrogate, we must wait for the next chunk to see if it starts
+//   with a matching low surrogate before encoding
+// - If no match arrives (chunk starts with non-low-surrogate, or stream ends), the orphaned
+//   high surrogate is replaced with U+FFFD (replacement character)
+//
+// State machine:
+//   holder->pending = kj::none    -> No pending high surrogate from previous chunk
+//   holder->pending = char16_t    -> High surrogate waiting for a matching low surrogate
+//
+// Transform algorithm for each chunk:
+//   1. Allocate buffer with prefix slot if we have a pending surrogate
+//   2. Write the chunk's UTF-16 code units into the buffer (after the prefix slot)
+//   3. If pending exists (an empty chunk returns early here and leaves pending untouched):
+//      - If chunk starts with low surrogate -> complete the pair (buf[0] = pending lead)
+//      - Otherwise -> replace pending with U+FFFD (buf[0] = REPLACEMENT_CHAR)
+//   4. If chunk ends with high surrogate -> save it as pending, exclude from output
+//   5. Replace any remaining unpaired surrogates with U+FFFD (simdutf::to_well_formed_utf16)
+//   6. Convert to UTF-8 and enqueue
+//
+// Flush algorithm (when stream closes):
+//   - If pending high surrogate exists -> emit U+FFFD (3 UTF-8 bytes: 0xEF 0xBF 0xBD)
+//
+// Ref: https://github.com/web-platform-tests/wpt/blob/master/encoding/streams/encode-utf8.any.js
 jsg::Ref<TextEncoderStream> TextEncoderStream::constructor(jsg::Lock& js) {
+  auto state = kj::rc<Holder>();
+
   auto transformer = TransformStream::constructor(js,
       Transformer{.transform = jsg::Function<Transformer::TransformAlgorithm>(
-                      [](jsg::Lock& js, auto chunk, auto controller) {
+                      [holder = state.addRef()](jsg::Lock& js, v8::Local<v8::Value> chunk,
+                          jsg::Ref<TransformStreamDefaultController> controller) mutable {
     auto str = jsg::check(chunk->ToString(js.v8Context()));
-    auto utf8Length = str->Utf8LengthV2(js.v8Isolate);
-
-    // Don't emit empty chunks
-    if (utf8Length == 0) {
-      return js.resolvedPromise();
+    size_t length = str->Length();
+
+    // Early exit: empty chunk with no pending surrogate produces no output
+    if (length == 0 && holder->pending == kj::none) return js.resolvedPromise();
+
+    // Allocate buffer: reserve slot 0 for pending surrogate if we have one
+    size_t prefix = (holder->pending != kj::none) ? 1 : 0;
+    auto buf = kj::heapArray<char16_t>(prefix + length);
+    str->WriteV2(js.v8Isolate, 0, length, reinterpret_cast<uint16_t*>(buf.begin() + prefix));
+
+    // Handle pending high surrogate from previous chunk
+    KJ_IF_SOME(lead, holder->pending) {
+      KJ_DASSERT(U_IS_LEAD(lead), "pending must be a high surrogate");
+      // Empty chunk: keep pending surrogate for next chunk
+      if (length == 0) return js.resolvedPromise();
+      holder->pending = kj::none;
+      // If chunk starts with matching low surrogate, complete the pair; otherwise emit U+FFFD
+      buf[0] = U_IS_TRAIL(buf[prefix]) ? lead : REPLACEMENT_CHAR;
     }
 
-    v8::Local<v8::ArrayBuffer> buffer;
-    JSG_REQUIRE(v8::ArrayBuffer::MaybeNew(js.v8Isolate, utf8Length).ToLocal(&buffer), RangeError,
-        "Cannot allocate space for TextEncoder.encode");
+    size_t end = prefix + length;
+    KJ_DASSERT(end <= buf.size());
 
-    auto bytes = jsg::asBytes(buffer).releaseAsChars();
-    [[maybe_unused]] auto written = str->WriteUtf8V2(
-        js.v8Isolate, bytes.begin(), bytes.size(), v8::String::WriteFlags::kReplaceInvalidUtf8);
+    // If chunk ends with high surrogate, save it for next chunk
+    if (end > 0 && U_IS_LEAD(buf[end - 1])) {
+      holder->pending = buf[--end];
+    }
 
-    KJ_DASSERT(written == buffer->ByteLength());
-    controller->enqueue(js, v8::Uint8Array::New(buffer, 0, buffer->ByteLength()));
+    // Nothing to encode after handling surrogates
+    if (end == 0) return js.resolvedPromise();
+
+    auto slice = buf.first(end);
+    KJ_DASSERT(slice.size() > 0);
+    auto result = simdutf::utf8_length_from_utf16_with_replacement(slice.begin(), slice.size());
+    // Only sanitize if the buffer still contains unpaired surrogates (at any position)
+    if (result.error == simdutf::error_code::SURROGATE) {
+      simdutf::to_well_formed_utf16(slice.begin(), slice.size(), slice.begin());
+    }
+    auto utf8Length = result.count;
+    KJ_DASSERT(utf8Length > 0);
+
+    auto backingStore = js.allocBackingStore(utf8Length, jsg::Lock::AllocOption::UNINITIALIZED);
+    auto dest = kj::ArrayPtr<char>(static_cast<char*>(backingStore->Data()), utf8Length);
+    [[maybe_unused]] auto written =
+        simdutf::convert_utf16_to_utf8(slice.begin(), slice.size(), dest.begin());
+    KJ_DASSERT(written == utf8Length, "simdutf should write exactly utf8Length bytes");
+
+    auto array = v8::Uint8Array::New(
+        v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8Length);
+    controller->enqueue(js, jsg::JsUint8Array(array));
+    return js.resolvedPromise();
+  }),
+        .flush = jsg::Function<Transformer::FlushAlgorithm>(
+            [holder = state.addRef()](
+                jsg::Lock& js, jsg::Ref<TransformStreamDefaultController> controller) mutable {
+    // If stream ends with orphaned high surrogate, emit replacement character
+    if (holder->pending != kj::none) {
+      auto backingStore = js.allocBackingStore(3, jsg::Lock::AllocOption::UNINITIALIZED);
+      memcpy(backingStore->Data(), REPLACEMENT_UTF8, 3);
+      auto array =
+          v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, 3);
+      controller->enqueue(js, jsg::JsUint8Array(array));
+    }
     return js.resolvedPromise();
   })},
       StreamQueuingStrategy{}, StreamQueuingStrategy{});
diff --git a/src/wpt/encoding-test.ts b/src/wpt/encoding-test.ts
@@ -110,20 +110,7 @@ export default {
     ],
   },
   'streams/encode-bad-chunks.any.js': {},
-  'streams/encode-utf8.any.js': {
-    comment: 'Surrogate pair handling across chunks not yet implemented',
-    expectedFailures: [
-      'a character split between chunks should be correctly encoded',
-      'a character following one split between chunks should be correctly encoded',
-      'an unmatched surrogate at the end of a chunk followed by an astral character in the next chunk should be replaced with the replacement character at the start of the next output chunk',
-      'an unmatched surrogate at the end of a chunk followed by an ascii character in the next chunk should be replaced with the replacement character at the start of the next output chunk',
-      'a non-terminal unpaired leading surrogate should immediately be replaced',
-      'two consecutive astral characters each split down the middle should be correctly reassembled',
-      'two consecutive astral characters each split down the middle with an invalid surrogate in the middle should be correctly encoded',
-      'an unmatched surrogate at the end of a chunk followed by a plane 1 character split into two chunks should result in the encoded plane 1 character appearing in the last output chunk',
-      'a leading surrogate chunk should be carried past empty chunks',
-    ],
-  },
+  'streams/encode-utf8.any.js': {},
   'streams/invalid-realm.window.js': {
     comment: 'Enable when ShadowRealm is supported',
     expectedFailures: [