Skip to content

Commit f171dbd

Browse files
committed
add buffer.transcode for nodejs_compat
1 parent 3f94280 commit f171dbd

File tree

8 files changed

+322
-24
lines changed

8 files changed

+322
-24
lines changed

src/node/buffer.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import {
1010
SlowBuffer,
1111
isAscii,
1212
isUtf8,
13+
transcode,
1314
} from 'node-internal:internal_buffer';
1415

1516
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
@@ -30,6 +31,7 @@ export {
3031
SlowBuffer,
3132
isAscii,
3233
isUtf8,
34+
transcode,
3335
};
3436

3537
export default {
@@ -46,4 +48,5 @@ export default {
4648
SlowBuffer,
4749
isAscii,
4850
isUtf8,
51+
transcode,
4952
};

src/node/internal/buffer.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,4 @@ export function decode(buffer: Uint8Array, state: Uint8Array): string;
3737
export function flush(state: Uint8Array): string;
3838
export function isAscii(value: ArrayBufferView): boolean;
3939
export function isUtf8(value: ArrayBufferView): boolean;
40+
export function transcode(source: ArrayBufferView, fromEncoding: string, toEncoding: string): ArrayBuffer;

src/node/internal/internal_buffer.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2294,6 +2294,18 @@ export function isUtf8(value: ArrayBufferView) {
22942294
return bufferUtil.isUtf8(value);
22952295
}
22962296

2297+
/**
 * Re-encodes the bytes of `source` from one character encoding to another.
 *
 * Both encoding names are run through `normalizeEncoding` and validated with
 * `Buffer.isEncoding`. The *normalized* names are what get forwarded to the
 * native `bufferUtil.transcode`: the native side matches encoding names by
 * exact string comparison against the canonical lowercase spellings
 * ("utf8", "latin1", "utf16le", ...), so forwarding the caller's raw string
 * would make inputs such as "UTF-8" pass validation here and then fail
 * natively.
 *
 * @param source - View over the bytes to convert.
 * @param fromEncoding - Name of the encoding `source` is currently in.
 * @param toEncoding - Name of the encoding to convert to.
 * @returns The result of `bufferUtil.transcode` for the normalized encodings.
 * @throws ERR_UNKNOWN_ENCODING when either encoding name is not recognized.
 */
export function transcode(source: ArrayBufferView, fromEncoding: string, toEncoding: string) {
  const normalizedFromEncoding = normalizeEncoding(fromEncoding);
  // The explicit undefined check also narrows the type for the native call
  // below (assumes normalizeEncoding yields undefined for unknown names).
  if (normalizedFromEncoding === undefined || !Buffer.isEncoding(normalizedFromEncoding)) {
    throw new ERR_UNKNOWN_ENCODING(fromEncoding);
  }
  const normalizedToEncoding = normalizeEncoding(toEncoding);
  if (normalizedToEncoding === undefined || !Buffer.isEncoding(normalizedToEncoding)) {
    throw new ERR_UNKNOWN_ENCODING(toEncoding);
  }
  return bufferUtil.transcode(source, normalizedFromEncoding, normalizedToEncoding);
}
2308+
22972309
export default {
22982310
Buffer,
22992311
constants,

src/workerd/api/node/buffer.c++

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88
#include "buffer-string-search.h"
99
#include <workerd/jsg/buffersource.h>
1010
#include <kj/encoding.h>
11-
#include <algorithm>
11+
#include <kj/array.h>
1212
#include "simdutf.h"
13+
#include "i18n.h"
14+
15+
#include <algorithm>
1316

1417
// These are defined by <sys/byteorder.h> or <netinet/in.h> on some systems.
1518
// To avoid warnings, undefine them before redefining them.
@@ -85,30 +88,20 @@ void SwapBytes(kj::ArrayPtr<kj::byte> bytes) {
8588
}
8689
}
8790

88-
enum class Encoding {
89-
ASCII,
90-
LATIN1,
91-
UTF8,
92-
UTF16LE,
93-
BASE64,
94-
BASE64URL,
95-
HEX,
96-
};
97-
98-
Encoding getEncoding(kj::StringPtr encoding) {
99-
if (encoding == "utf8"_kj) {
91+
Encoding getEncoding(kj::StringPtr input) {
92+
if (input == "utf8"_kj) {
10093
return Encoding::UTF8;
101-
} else if (encoding == "ascii") {
94+
} else if (input == "ascii"_kj) {
10295
return Encoding::ASCII;
103-
} else if (encoding == "latin1") {
96+
} else if (input == "latin1"_kj) {
10497
return Encoding::LATIN1;
105-
} else if (encoding == "utf16le") {
98+
} else if (input == "utf16le"_kj) {
10699
return Encoding::UTF16LE;
107-
} else if (encoding == "base64") {
100+
} else if (input == "base64"_kj) {
108101
return Encoding::BASE64;
109-
} else if (encoding == "base64url") {
102+
} else if (input == "base64url"_kj) {
110103
return Encoding::BASE64URL;
111-
} else if (encoding == "hex") {
104+
} else if (input == "hex"_kj) {
112105
return Encoding::HEX;
113106
}
114107

@@ -137,7 +130,7 @@ kj::Array<byte> decodeHexTruncated(kj::ArrayPtr<kj::byte> text, bool strict = fa
137130
}
138131
text = text.slice(0, text.size() - 1);
139132
}
140-
kj::Vector vec = kj::Vector<kj::byte>(text.size() / 2);
133+
auto vec = kj::Vector<kj::byte>(text.size() / 2);
141134

142135
for (size_t i = 0; i < text.size(); i += 2) {
143136
byte b = 0;
@@ -216,8 +209,9 @@ uint32_t writeInto(
216209
dest.first(amountToCopy).copyFrom(bytes.first(amountToCopy));
217210
return amountToCopy;
218211
}
212+
default:
213+
KJ_UNREACHABLE;
219214
}
220-
KJ_UNREACHABLE;
221215
}
222216

223217
kj::Array<kj::byte> decodeStringImpl(
@@ -272,8 +266,9 @@ kj::Array<kj::byte> decodeStringImpl(
272266
string.writeInto(js, buf, options);
273267
return decodeHexTruncated(buf, strict);
274268
}
269+
default:
270+
KJ_UNREACHABLE;
275271
}
276-
KJ_UNREACHABLE;
277272
}
278273
} // namespace
279274

@@ -561,8 +556,9 @@ jsg::JsString toStringImpl(
561556
case Encoding::HEX: {
562557
return js.str(kj::encodeHex(slice));
563558
}
559+
default:
560+
KJ_UNREACHABLE;
564561
}
565-
KJ_UNREACHABLE;
566562
}
567563

568564
} // namespace
@@ -673,7 +669,7 @@ inline kj::byte* getIncompleteCharacterBuffer(kj::ArrayPtr<kj::byte> state) {
673669
return state.begin() + BufferUtil::kIncompleteCharactersStart;
674670
}
675671

676-
inline Encoding getEncoding(kj::ArrayPtr<kj::byte> state) {
672+
Encoding getEncoding(kj::ArrayPtr<kj::byte> state) {
677673
JSG_REQUIRE(state[BufferUtil::kEncoding] <= static_cast<kj::byte>(Encoding::HEX),
678674
Error, "Invalid StringDecoder state");
679675
return static_cast<Encoding>(state[BufferUtil::kEncoding]);
@@ -876,5 +872,16 @@ bool BufferUtil::isUtf8(kj::Array<kj::byte> buffer) {
876872
return simdutf::validate_utf8(buffer.asChars().begin(), buffer.size());
877873
}
878874

875+
// Converts `source` from the encoding named by `rawFromEncoding` to the one
// named by `rawToEncoding`. Both names are resolved with getEncoding() first
// (source encoding, then target encoding); a JS Error is raised when either
// resolved encoding is not accepted by i18n::canBeTranscoded().
kj::Array<kj::byte> BufferUtil::transcode(
    kj::Array<kj::byte> source, kj::String rawFromEncoding, kj::String rawToEncoding) {
  const Encoding sourceEncoding = getEncoding(rawFromEncoding);
  const Encoding targetEncoding = getEncoding(rawToEncoding);

  const bool bothSupported =
      i18n::canBeTranscoded(sourceEncoding) && i18n::canBeTranscoded(targetEncoding);
  JSG_REQUIRE(bothSupported, Error, "Unable to transcode Buffer");

  return i18n::transcode(source, sourceEncoding, targetEncoding);
}
885+
879886
} // namespace workerd::api::node {
880887

src/workerd/api/node/buffer.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ class BufferUtil final: public jsg::Object {
8181
jsg::JsString flush(jsg::Lock& js, kj::Array<kj::byte> state);
8282
bool isAscii(kj::Array<kj::byte> bytes);
8383
bool isUtf8(kj::Array<kj::byte> bytes);
84+
kj::Array<kj::byte> transcode(kj::Array<kj::byte> source,
85+
kj::String rawFromEncoding,
86+
kj::String rawToEncoding);
8487

8588
JSG_RESOURCE_TYPE(BufferUtil) {
8689
JSG_METHOD(byteLength);
@@ -94,6 +97,7 @@ class BufferUtil final: public jsg::Object {
9497
JSG_METHOD(write);
9598
JSG_METHOD(isAscii);
9699
JSG_METHOD(isUtf8);
100+
JSG_METHOD(transcode);
97101

98102
// For StringDecoder
99103
JSG_METHOD(decode);

src/workerd/api/node/i18n.c++

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
// Copyright (c) 2017-2022 Cloudflare, Inc.
2+
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
3+
// https://opensource.org/licenses/Apache-2.0
4+
// Copyright Joyent and Node contributors. All rights reserved. MIT license.
5+
6+
#include "i18n.h"
7+
8+
#include <workerd/jsg/exception.h>
9+
10+
#include <unicode/putil.h>
11+
#include <unicode/timezone.h>
12+
#include <unicode/uchar.h>
13+
#include <unicode/uclean.h>
14+
#include <unicode/ucnv.h>
15+
#include <unicode/udata.h>
16+
#include <unicode/uidna.h>
17+
#include <unicode/ulocdata.h>
18+
#include <unicode/urename.h>
19+
#include <unicode/ustring.h>
20+
#include <unicode/utf16.h>
21+
#include <unicode/utf8.h>
22+
#include <unicode/utypes.h>
23+
#include <unicode/uvernum.h>
24+
#include <unicode/uversion.h>
25+
26+
namespace workerd::api::node {
27+
28+
namespace i18n {
29+
30+
namespace {
31+
32+
struct ConverterDisposer : public kj::Disposer {
33+
static const ConverterDisposer INSTANCE;
34+
void disposeImpl(void* pointer) const override {
35+
ucnv_close(reinterpret_cast<UConverter*>(pointer));
36+
}
37+
};
38+
39+
const ConverterDisposer ConverterDisposer::INSTANCE;
40+
41+
// Maps an Encoding value to the converter name string passed to ucnv_open().
// Only ASCII, LATIN1, UCS2 and UTF8 have a name; any other value is treated as
// unreachable.
const char* getEncodingName(Encoding input) {
  if (input == Encoding::ASCII) {
    return "us-ascii";
  }
  if (input == Encoding::LATIN1) {
    return "iso8859-1";
  }
  if (input == Encoding::UCS2) {
    return "utf16le";
  }
  if (input == Encoding::UTF8) {
    return "utf-8";
  }
  KJ_UNREACHABLE;
}
55+
56+
// Signature shared by all transcoding strategies in this file: takes the source
// bytes plus both encodings and yields the converted bytes, or kj::none when
// the conversion failed.
using TranscodeImpl = kj::Maybe<kj::Array<kj::byte>> (*)(
    kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding);
58+
59+
// Generic transcoding path: pivots through ICU's ucnv_convertEx() using one
// converter for the source encoding and one for the target encoding.
// Characters that cannot be represented in the target encoding are replaced by
// '?' (repeated to the target converter's minimum character size).
//
// Returns the converted bytes, or kj::none if ICU reports an error.
kj::Maybe<kj::Array<kj::byte>> TranscodeDefault(kj::ArrayPtr<kj::byte> source,
    Encoding fromEncoding, Encoding toEncoding) {
  // Nothing to convert. Also avoids handing null source/target pointers to
  // ucnv_convertEx(), which rejects them as illegal arguments.
  if (source.size() == 0) {
    return kj::heapArray<kj::byte>(0);
  }

  Converter to(toEncoding);
  std::string substitute(to.minSize(), '?');
  to.setSubstitudeChars(substitute);
  Converter from(fromEncoding);

  // Worst case every source byte becomes one character of the widest size the
  // target converter can emit, so the bound is a product, not a sum. A sum
  // makes any expanding conversion (e.g. Latin-1 -> UTF-8) fail with a buffer
  // overflow error.
  const size_t limit = source.size() * to.maxSize();
  auto out = kj::heapArray<kj::byte>(limit);
  char* target = out.asChars().begin();
  const char* source_ = source.asChars().begin();
  UErrorCode status = U_ZERO_ERROR;
  ucnv_convertEx(to.conv(), from.conv(), &target, target + limit, &source_,
      source_ + source.size(), nullptr, nullptr, nullptr, nullptr, true, true, &status);
  if (U_SUCCESS(status)) {
    // `target` was advanced past the last byte written.
    return out.slice(0, target - out.asChars().begin()).attach(kj::mv(out));
  }

  return kj::none;
}
79+
80+
// Converts single-byte encoded input (the dispatcher routes ASCII and Latin-1
// here) to UTF-16 code units using ucnv_toUChars().
//
// Returns the bytes of the produced code units, or kj::none if ICU reports an
// error. `toEncoding` is unused; it exists to match the TranscodeImpl signature.
kj::Maybe<kj::Array<kj::byte>> TranscodeToUCS2(kj::ArrayPtr<kj::byte> source, Encoding fromEncoding,
    Encoding toEncoding) {
  UErrorCode status = U_ZERO_ERROR;
  Converter from(fromEncoding);

  // One output code unit per input byte.
  auto out = kj::heapArray<UChar>(source.size());

  // destCapacity is measured in UChars, not bytes. Passing a byte count (twice
  // the real capacity) lets ICU write its NUL terminator one element past the
  // end of `out`. With the exact capacity ICU merely reports
  // U_STRING_NOT_TERMINATED_WARNING, which still counts as success.
  const auto written = ucnv_toUChars(from.conv(), out.begin(), static_cast<int32_t>(out.size()),
      source.asChars().begin(), static_cast<int32_t>(source.size()), &status);
  if (U_SUCCESS(status)) {
    return out.slice(0, written).asBytes().attach(kj::mv(out));
  }
  return kj::none;
}
93+
94+
// Converts UTF-16 code units (given as raw bytes) to the target encoding with
// ucnv_fromUChars(). Unrepresentable characters are replaced by '?' (repeated
// to the target converter's minimum character size).
//
// Returns the converted bytes, or kj::none if ICU reports an error. A trailing
// odd byte in `source` is ignored. `fromEncoding` is unused; it exists to match
// the TranscodeImpl signature.
kj::Maybe<kj::Array<kj::byte>> TranscodeFromUCS2(kj::ArrayPtr<kj::byte> source,
    Encoding fromEncoding, Encoding toEncoding) {
  UErrorCode status = U_ZERO_ERROR;
  Converter to(toEncoding);
  std::string substitute(to.minSize(), '?');
  to.setSubstitudeChars(substitute);

  // `source` holds bytes, so the number of code units is size / sizeof(UChar).
  const size_t length_in_chars = source.size() / sizeof(UChar);

  // Copy into a UChar array so ICU reads correctly aligned code units. It is
  // sized to the input; a fixed-size stack buffer would overflow on large input.
  auto sourcebuf = kj::heapArray<UChar>(length_in_chars);
  sourcebuf.asBytes().copyFrom(source.slice(0, length_in_chars * sizeof(UChar)));

  // Each code unit produces at most maxSize() bytes in the target encoding.
  auto destbuf = kj::heapArray<kj::byte>(length_in_chars * to.maxSize());
  auto len = ucnv_fromUChars(to.conv(), destbuf.asChars().begin(),
      static_cast<int32_t>(destbuf.size()), sourcebuf.begin(),
      static_cast<int32_t>(length_in_chars), &status);

  if (U_SUCCESS(status)) {
    return destbuf.slice(0, len).attach(kj::mv(destbuf));
  }

  return kj::none;
}
116+
117+
// Converts UTF-8 bytes to UTF-16 code units with ICU's u_strFromUTF8().
//
// Returns the bytes of the produced code units, or kj::none if ICU reports an
// error (u_strFromUTF8 fails on ill-formed UTF-8). The encoding parameters are
// unused; they exist to match the TranscodeImpl signature.
kj::Maybe<kj::Array<kj::byte>> TranscodeUcs2FromUtf8(kj::ArrayPtr<kj::byte> source,
    Encoding fromEncoding, Encoding toEncoding) {
  UErrorCode status = U_ZERO_ERROR;

  // A UTF-8 sequence never decodes to more UTF-16 code units than it has bytes
  // (1-3 byte sequences -> 1 unit, 4-byte sequences -> 2 units), so one UChar
  // per input byte is always enough.
  auto out = kj::heapArray<UChar>(source.size());
  int32_t written = 0;
  u_strFromUTF8(out.begin(), static_cast<int32_t>(out.size()), &written,
      source.asChars().begin(), static_cast<int32_t>(source.size()), &status);

  if (U_SUCCESS(status)) {
    return out.slice(0, written).asBytes().attach(kj::mv(out));
  }
  return kj::none;
}
121+
122+
// Converts UTF-16 code units (given as raw bytes) to UTF-8 with ICU's
// u_strToUTF8().
//
// Returns the UTF-8 bytes, or kj::none if ICU reports an error (u_strToUTF8
// fails on unpaired surrogates). A trailing odd byte in `source` is ignored.
// The encoding parameters are unused; they exist to match the TranscodeImpl
// signature.
kj::Maybe<kj::Array<kj::byte>> TranscodeUtf8FromUcs2(kj::ArrayPtr<kj::byte> source,
    Encoding fromEncoding, Encoding toEncoding) {
  UErrorCode status = U_ZERO_ERROR;

  // `source` holds bytes; copy into a UChar array so ICU reads correctly
  // aligned code units.
  const size_t length_in_chars = source.size() / sizeof(UChar);
  auto sourcebuf = kj::heapArray<UChar>(length_in_chars);
  sourcebuf.asBytes().copyFrom(source.slice(0, length_in_chars * sizeof(UChar)));

  // A single code unit needs at most 3 UTF-8 bytes; a surrogate pair (2 units)
  // needs 4, which is still within 3 bytes per unit.
  auto destbuf = kj::heapArray<kj::byte>(length_in_chars * 3);
  int32_t written = 0;
  u_strToUTF8(destbuf.asChars().begin(), static_cast<int32_t>(destbuf.size()), &written,
      sourcebuf.begin(), static_cast<int32_t>(length_in_chars), &status);

  if (U_SUCCESS(status)) {
    return destbuf.slice(0, written).attach(kj::mv(destbuf));
  }
  return kj::none;
}
126+
127+
} // namespace
128+
129+
// Opens an ICU converter for `encoding` (looked up by name via
// getEncodingName()), takes ownership of it, and applies `substitude` as its
// substitution characters. Asserts if ICU fails to open the converter.
Converter::Converter(Encoding encoding, kj::StringPtr substitude) {
  UErrorCode status = U_ZERO_ERROR;
  UConverter* opened = ucnv_open(getEncodingName(encoding), &status);
  KJ_ASSERT(U_SUCCESS(status));

  // From here on the handle is closed by ConverterDisposer when conv_ dies.
  conv_ = kj::Own<UConverter>(opened, ConverterDisposer::INSTANCE);
  setSubstitudeChars(substitude);
}
137+
138+
// Adopts an already-opened ICU converter handle (it will be closed through
// ConverterDisposer) and applies `substitude` as its substitution characters.
Converter::Converter(UConverter* converter, kj::StringPtr substitude)
    : conv_(kj::Own<UConverter>(converter, ConverterDisposer::INSTANCE)) {
  setSubstitudeChars(substitude);
}
142+
143+
// Exposes the underlying ICU handle as a mutable pointer even from a const
// Converter, since the ICU conversion functions take a non-const UConverter*.
UConverter* Converter::conv() const {
  const UConverter* handle = conv_.get();
  return const_cast<UConverter*>(handle);
}
146+
147+
size_t Converter::maxSize() const {
148+
KJ_ASSERT_NONNULL(conv_.get());
149+
return ucnv_getMaxCharSize(conv_.get());
150+
}
151+
152+
size_t Converter::minSize() const {
153+
KJ_ASSERT_NONNULL(conv_.get());
154+
return ucnv_getMinCharSize(conv_.get());
155+
}
156+
157+
void Converter::reset() {
158+
KJ_ASSERT_NONNULL(conv_.get());
159+
ucnv_reset(conv_.get());
160+
}
161+
162+
// Sets the byte sequence the converter emits in place of characters it cannot
// represent. An empty `sub` leaves the converter's current substitution
// characters untouched.
//
// Asserts that a converter is present and that ICU accepts a non-empty `sub`.
void Converter::setSubstitudeChars(kj::StringPtr sub) {
  KJ_ASSERT_NONNULL(conv_.get());

  // ucnv_setSubstChars() rejects a sequence shorter than the converter's
  // minimum character size, so an empty string would always trip the assertion
  // below. Both constructors forward their `substitude` argument here
  // unconditionally, and callers construct Converter with only an encoding
  // (presumably relying on an empty default -- confirm against i18n.h), so
  // "no substitution requested" must be a no-op.
  if (sub.size() == 0) {
    return;
  }

  UErrorCode status = U_ZERO_ERROR;
  ucnv_setSubstChars(conv_.get(), sub.begin(), sub.size(), &status);
  KJ_ASSERT(U_SUCCESS(status));
}
168+
169+
// Converts `source` from `fromEncoding` to `toEncoding`, picking a conversion
// routine by encoding pair:
//   ASCII / LATIN1 -> UCS2 : TranscodeToUCS2
//   UTF8           -> UCS2 : TranscodeUcs2FromUtf8
//   UCS2           -> UCS2 : TranscodeDefault
//   UCS2           -> UTF8 : TranscodeUtf8FromUcs2
//   UCS2           -> other: TranscodeFromUCS2
//   everything else        : TranscodeDefault
//
// Throws a JS Error ("Unable to transcode buffer") when the chosen routine
// yields kj::none. A source encoding outside ASCII, LATIN1, UTF8 and UCS2 is
// treated as unreachable.
kj::Array<kj::byte> transcode(kj::ArrayPtr<kj::byte> source, Encoding fromEncoding,
    Encoding toEncoding) {
  TranscodeImpl transcode_function = &TranscodeDefault;
  switch (fromEncoding) {
    case Encoding::ASCII:
    case Encoding::LATIN1:
      if (toEncoding == Encoding::UCS2) {
        transcode_function = &TranscodeToUCS2;
      }
      break;
    case Encoding::UTF8:
      if (toEncoding == Encoding::UCS2) {
        transcode_function = &TranscodeUcs2FromUtf8;
      }
      break;
    case Encoding::UCS2:
      switch (toEncoding) {
        case Encoding::UCS2:
          transcode_function = &TranscodeDefault;
          break;
        case Encoding::UTF8:
          transcode_function = &TranscodeUtf8FromUcs2;
          break;
        default:
          transcode_function = &TranscodeFromUCS2;
          break;
      }
      // Without this break, control falls out of the nested switch straight
      // into the outer `default` and hits KJ_UNREACHABLE for every UCS2 source.
      break;
    default:
      KJ_UNREACHABLE;
  }

  return JSG_REQUIRE_NONNULL(transcode_function(source, fromEncoding, toEncoding), Error,
      "Unable to transcode buffer");
}
202+
203+
} // namespace i18n
204+
205+
} // namespace workerd::api::node

0 commit comments

Comments
 (0)