|
4 | 4 |
|
5 | 5 | #include "encoding.h" |
6 | 6 |
|
| 7 | +#include "simdutf.h" |
| 8 | + |
7 | 9 | #include <workerd/api/encoding.h> |
8 | 10 | #include <workerd/api/streams/standard.h> |
9 | 11 | #include <workerd/io/features.h> |
10 | 12 | #include <workerd/jsg/jsg.h> |
11 | 13 |
|
| 14 | +#include <v8.h> |
| 15 | + |
| 16 | +#include <kj/common.h> |
| 17 | +#include <kj/refcount.h> |
| 18 | + |
12 | 19 | namespace workerd::api { |
13 | 20 |
|
| 21 | +namespace { |
| 22 | +constexpr char16_t REPLACEMENT_CHAR = 0xFFFD; |
| 23 | +constexpr kj::byte REPLACEMENT_UTF8[] = {0xEF, 0xBF, 0xBD}; |
| 24 | + |
| 25 | +struct Holder: public kj::Refcounted { |
| 26 | + kj::Maybe<char16_t> pending = kj::none; |
| 27 | +}; |
| 28 | +} // namespace |
| 29 | + |
| 30 | +// TextEncoderStream encodes a stream of JavaScript strings into UTF-8 bytes. |
| 31 | +// |
| 32 | +// WHATWG Encoding spec requirement (https://encoding.spec.whatwg.org/#interface-textencoderstream): |
| 33 | +// The encoder must handle surrogate pairs that may be split across chunk boundaries. |
| 34 | +// This is tested by WPT's "encoding/streams/encode-utf8.any.js" which includes: |
| 35 | +// - "a character split between chunks should be correctly encoded" test |
| 36 | +// - Input: ["\uD83D", "\uDC99"] (U+1F499 💙 split into high/low surrogate chunks) |
| 37 | +// - Expected output: [0xf0, 0x9f, 0x92, 0x99] (U+1F499 encoded as UTF-8) |
| 38 | +// |
| 39 | +// The main complexity is handling UTF-16 surrogate pairs that may be split across chunks: |
| 40 | +// - JavaScript strings use UTF-16 encoding internally |
| 41 | +// - A surrogate pair consists of a high surrogate (0xD800-0xDBFF) followed by a low surrogate |
| 42 | +// (0xDC00-0xDFFF), representing code points above U+FFFF (e.g., emoji, rare CJK characters) |
| 43 | +// - If a chunk ends with a high surrogate, we must wait for the next chunk to see if it starts |
| 44 | +// with a matching low surrogate before encoding |
| 45 | +// - If no match arrives (chunk starts with non-low-surrogate, or stream ends), the orphaned |
| 46 | +// high surrogate is replaced with U+FFFD (replacement character) |
| 47 | +// |
| 48 | +// State machine: |
| 49 | +// holder->pending = kj::none -> No pending high surrogate from previous chunk |
| 50 | +// holder->pending = char16_t -> High surrogate waiting for a matching low surrogate |
| 51 | +// |
| 52 | +// Transform algorithm for each chunk: |
| 53 | +// 1. Allocate buffer with prefix slot if we have a pending surrogate |
| 54 | +// 2. Write the chunk's UTF-16 code units into the buffer (after the prefix slot) |
| 55 | +// 3. If pending exists: |
| 56 | +// - If chunk starts with low surrogate -> complete the pair (buf[0] = pending lead) |
| 57 | +// - Otherwise -> replace pending with U+FFFD (buf[0] = REPLACEMENT_CHAR) |
| 58 | +// 4. If chunk ends with high surrogate -> save it as pending, exclude from output |
| 59 | +// 5. Sanitize remaining surrogates with simdutf::to_well_formed_utf16 |
| 60 | +// 6. Convert to UTF-8 and enqueue |
| 61 | +// |
| 62 | +// Flush algorithm (when stream closes): |
| 63 | +// - If pending high surrogate exists -> emit U+FFFD (3 UTF-8 bytes: 0xEF 0xBF 0xBD) |
| 64 | +// |
| 65 | +// Ref: https://github.com/web-platform-tests/wpt/blob/master/encoding/streams/encode-utf8.any.js |
14 | 66 | jsg::Ref<TextEncoderStream> TextEncoderStream::constructor(jsg::Lock& js) { |
| 67 | + auto state = kj::rc<Holder>(); |
| 68 | + |
15 | 69 | auto transformer = TransformStream::constructor(js, |
16 | 70 | Transformer{.transform = jsg::Function<Transformer::TransformAlgorithm>( |
17 | | - [](jsg::Lock& js, auto chunk, auto controller) { |
| 71 | + [holder = state.addRef()](jsg::Lock& js, v8::Local<v8::Value> chunk, |
| 72 | + jsg::Ref<TransformStreamDefaultController> controller) mutable { |
18 | 73 | auto str = jsg::check(chunk->ToString(js.v8Context())); |
19 | | - auto utf8Length = str->Utf8LengthV2(js.v8Isolate); |
20 | | - |
21 | | - // Don't emit empty chunks |
22 | | - if (utf8Length == 0) { |
23 | | - return js.resolvedPromise(); |
| 74 | + size_t length = str->Length(); |
| 75 | + |
| 76 | + // Early exit: empty chunk with no pending surrogate produces no output |
| 77 | + if (length == 0 && holder->pending == kj::none) return js.resolvedPromise(); |
| 78 | + |
| 79 | + // Allocate buffer: reserve slot 0 for pending surrogate if we have one |
| 80 | + size_t prefix = (holder->pending != kj::none) ? 1 : 0; |
| 81 | + auto buf = kj::heapArray<char16_t>(prefix + length); |
| 82 | + str->WriteV2(js.v8Isolate, 0, length, reinterpret_cast<uint16_t*>(buf.begin() + prefix)); |
| 83 | + |
| 84 | + // Handle pending high surrogate from previous chunk |
| 85 | + KJ_IF_SOME(lead, holder->pending) { |
| 86 | + KJ_DASSERT(U_IS_LEAD(lead), "pending must be a high surrogate"); |
| 87 | + // Empty chunk: keep pending surrogate for next chunk |
| 88 | + if (length == 0) return js.resolvedPromise(); |
| 89 | + holder->pending = kj::none; |
| 90 | + // If chunk starts with matching low surrogate, complete the pair; otherwise emit U+FFFD |
| 91 | + buf[0] = U_IS_TRAIL(buf[prefix]) ? lead : REPLACEMENT_CHAR; |
24 | 92 | } |
25 | 93 |
|
26 | | - v8::Local<v8::ArrayBuffer> buffer; |
27 | | - JSG_REQUIRE(v8::ArrayBuffer::MaybeNew(js.v8Isolate, utf8Length).ToLocal(&buffer), RangeError, |
28 | | - "Cannot allocate space for TextEncoder.encode"); |
| 94 | + size_t end = prefix + length; |
| 95 | + KJ_DASSERT(end <= buf.size()); |
29 | 96 |
|
30 | | - auto bytes = jsg::asBytes(buffer).releaseAsChars(); |
31 | | - [[maybe_unused]] auto written = str->WriteUtf8V2( |
32 | | - js.v8Isolate, bytes.begin(), bytes.size(), v8::String::WriteFlags::kReplaceInvalidUtf8); |
| 97 | + // If chunk ends with high surrogate, save it for next chunk |
| 98 | + if (end > 0 && U_IS_LEAD(buf[end - 1])) { |
| 99 | + holder->pending = buf[--end]; |
| 100 | + } |
33 | 101 |
|
34 | | - KJ_DASSERT(written == buffer->ByteLength()); |
35 | | - controller->enqueue(js, v8::Uint8Array::New(buffer, 0, buffer->ByteLength())); |
| 102 | + // Nothing to encode after handling surrogates |
| 103 | + if (end == 0) return js.resolvedPromise(); |
| 104 | + |
| 105 | + auto slice = buf.first(end); |
| 106 | + KJ_DASSERT(slice.size() > 0); |
| 107 | + auto result = simdutf::utf8_length_from_utf16_with_replacement(slice.begin(), slice.size()); |
| 108 | + // Only sanitize if there are unpaired surrogates in the middle of the buffer |
| 109 | + if (result.error == simdutf::error_code::SURROGATE) { |
| 110 | + simdutf::to_well_formed_utf16(slice.begin(), slice.size(), slice.begin()); |
| 111 | + } |
| 112 | + auto utf8Length = result.count; |
| 113 | + KJ_DASSERT(utf8Length > 0); |
| 114 | + |
| 115 | + auto backingStore = js.allocBackingStore(utf8Length, jsg::Lock::AllocOption::UNINITIALIZED); |
| 116 | + auto dest = kj::ArrayPtr<char>(static_cast<char*>(backingStore->Data()), utf8Length); |
| 117 | + [[maybe_unused]] auto written = |
| 118 | + simdutf::convert_utf16_to_utf8(slice.begin(), slice.size(), dest.begin()); |
| 119 | + KJ_DASSERT(written == utf8Length, "simdutf should write exactly utf8Length bytes"); |
| 120 | + |
| 121 | + auto array = v8::Uint8Array::New( |
| 122 | + v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8Length); |
| 123 | + controller->enqueue(js, jsg::JsUint8Array(array)); |
| 124 | + return js.resolvedPromise(); |
| 125 | + }), |
| 126 | + .flush = jsg::Function<Transformer::FlushAlgorithm>( |
| 127 | + [holder = state.addRef()]( |
| 128 | + jsg::Lock& js, jsg::Ref<TransformStreamDefaultController> controller) mutable { |
| 129 | + // If stream ends with orphaned high surrogate, emit replacement character |
| 130 | + if (holder->pending != kj::none) { |
| 131 | + auto backingStore = js.allocBackingStore(3, jsg::Lock::AllocOption::UNINITIALIZED); |
| 132 | + memcpy(backingStore->Data(), REPLACEMENT_UTF8, 3); |
| 133 | + auto array = |
| 134 | + v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, 3); |
| 135 | + controller->enqueue(js, jsg::JsUint8Array(array)); |
| 136 | + } |
36 | 137 | return js.resolvedPromise(); |
37 | 138 | })}, |
38 | 139 | StreamQueuingStrategy{}, StreamQueuingStrategy{}); |
|
0 commit comments