Skip to content

Commit fab72b1

Browse files
addaleaxjuanarbol
authored andcommitted
src: use simdutf for converting externalized builtins to UTF-16
Remove the dependency on ICU for this part, as well as the hacky way of converting embedder main sources to UTF-8 via V8 APIs. Allow `UnionBytes` to own the memory its pointing to in order to simplify the code on the `BuiltinLoader` side. PR-URL: #46119 Reviewed-By: Yagiz Nizipli <yagiz@nizipli.com> Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com> Reviewed-By: Darshan Sen <raisinten@gmail.com>
1 parent f3dc432 commit fab72b1

10 files changed

+106
-105
lines changed

configure.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -2077,11 +2077,7 @@ def make_bin_override():
20772077
for builtin in shareable_builtins:
20782078
builtin_id = 'node_shared_builtin_' + builtin.replace('/', '_') + '_path'
20792079
if getattr(options, builtin_id):
2080-
if options.with_intl == 'none':
2081-
option_name = '--shared-builtin-' + builtin + '-path'
2082-
error(option_name + ' is incompatible with --with-intl=none' )
2083-
else:
2084-
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
2080+
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
20852081
else:
20862082
output['variables']['node_builtin_shareable_builtins'] += [shareable_builtins[builtin]]
20872083

node.gyp

+1
Original file line numberDiff line numberDiff line change
@@ -1211,6 +1211,7 @@
12111211
'node_dtrace_header',
12121212
'node_dtrace_ustack',
12131213
'node_dtrace_provider',
1214+
'deps/simdutf/simdutf.gyp:simdutf',
12141215
],
12151216

12161217
'includes': [

src/api/environment.cc

+2-13
Original file line numberDiff line numberDiff line change
@@ -464,21 +464,10 @@ MaybeLocal<Value> LoadEnvironment(
464464
Environment* env,
465465
const char* main_script_source_utf8) {
466466
CHECK_NOT_NULL(main_script_source_utf8);
467-
Isolate* isolate = env->isolate();
468467
return LoadEnvironment(
469-
env,
470-
[&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
471-
// This is a slightly hacky way to convert UTF-8 to UTF-16.
472-
Local<String> str =
473-
String::NewFromUtf8(isolate,
474-
main_script_source_utf8).ToLocalChecked();
475-
auto main_utf16 = std::make_unique<String::Value>(isolate, str);
476-
477-
// TODO(addaleax): Avoid having a global table for all scripts.
468+
env, [&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
478469
std::string name = "embedder_main_" + std::to_string(env->thread_id());
479-
builtins::BuiltinLoader::Add(
480-
name.c_str(), UnionBytes(**main_utf16, main_utf16->length()));
481-
env->set_main_utf16(std::move(main_utf16));
470+
builtins::BuiltinLoader::Add(name.c_str(), main_script_source_utf8);
482471
Realm* realm = env->principal_realm();
483472

484473
// Arguments must match the parameters specified in

src/env-inl.h

-5
Original file line numberDiff line numberDiff line change
@@ -799,11 +799,6 @@ void Environment::RemoveCleanupHook(CleanupQueue::Callback fn, void* arg) {
799799
cleanup_queue_.Remove(fn, arg);
800800
}
801801

802-
void Environment::set_main_utf16(std::unique_ptr<v8::String::Value> str) {
803-
CHECK(!main_utf16_);
804-
main_utf16_ = std::move(str);
805-
}
806-
807802
void Environment::set_process_exit_handler(
808803
std::function<void(Environment*, int)>&& handler) {
809804
process_exit_handler_ = std::move(handler);

src/env.h

-6
Original file line numberDiff line numberDiff line change
@@ -950,7 +950,6 @@ class Environment : public MemoryRetainer {
950950

951951
#endif // HAVE_INSPECTOR
952952

953-
inline void set_main_utf16(std::unique_ptr<v8::String::Value>);
954953
inline void set_process_exit_handler(
955954
std::function<void(Environment*, int)>&& handler);
956955

@@ -1122,11 +1121,6 @@ class Environment : public MemoryRetainer {
11221121

11231122
std::unique_ptr<Realm> principal_realm_ = nullptr;
11241123

1125-
// Keeps the main script source alive is one was passed to LoadEnvironment().
1126-
// We should probably find a way to just use plain `v8::String`s created from
1127-
// the source passed to LoadEnvironment() directly instead.
1128-
std::unique_ptr<v8::String::Value> main_utf16_;
1129-
11301124
// Used by allocate_managed_buffer() and release_managed_buffer() to keep
11311125
// track of the BackingStore for a given pointer.
11321126
std::unordered_map<char*, std::unique_ptr<v8::BackingStore>>

src/node_builtins.cc

+14-12
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "env-inl.h"
44
#include "node_external_reference.h"
55
#include "node_internals.h"
6+
#include "simdutf.h"
67
#include "util-inl.h"
78

89
namespace node {
@@ -32,7 +33,6 @@ BuiltinLoader BuiltinLoader::instance_;
3233

3334
BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
3435
LoadJavaScriptSource();
35-
#if defined(NODE_HAVE_I18N_SUPPORT)
3636
#ifdef NODE_SHARED_BUILTIN_CJS_MODULE_LEXER_LEXER_PATH
3737
AddExternalizedBuiltin(
3838
"internal/deps/cjs-module-lexer/lexer",
@@ -49,7 +49,6 @@ BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
4949
AddExternalizedBuiltin("internal/deps/undici/undici",
5050
STRINGIFY(NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH));
5151
#endif // NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH
52-
#endif // NODE_HAVE_I18N_SUPPORT
5352
}
5453

5554
BuiltinLoader* BuiltinLoader::GetInstance() {
@@ -237,7 +236,6 @@ MaybeLocal<String> BuiltinLoader::LoadBuiltinSource(Isolate* isolate,
237236
#endif // NODE_BUILTIN_MODULES_PATH
238237
}
239238

240-
#if defined(NODE_HAVE_I18N_SUPPORT)
241239
void BuiltinLoader::AddExternalizedBuiltin(const char* id,
242240
const char* filename) {
243241
std::string source;
@@ -249,16 +247,20 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id,
249247
return;
250248
}
251249

252-
icu::UnicodeString utf16 = icu::UnicodeString::fromUTF8(
253-
icu::StringPiece(source.data(), source.length()));
254-
auto source_utf16 = std::make_unique<icu::UnicodeString>(utf16);
255-
Add(id,
256-
UnionBytes(reinterpret_cast<const uint16_t*>((*source_utf16).getBuffer()),
257-
utf16.length()));
258-
// keep source bytes for builtin alive while BuiltinLoader exists
259-
GetInstance()->externalized_source_bytes_.push_back(std::move(source_utf16));
250+
Add(id, source);
251+
}
252+
253+
bool BuiltinLoader::Add(const char* id, std::string_view utf8source) {
254+
size_t expected_u16_length =
255+
simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length());
256+
auto out = std::make_shared<std::vector<uint16_t>>(expected_u16_length);
257+
size_t u16_length = simdutf::convert_utf8_to_utf16le(
258+
utf8source.data(),
259+
utf8source.length(),
260+
reinterpret_cast<char16_t*>(out->data()));
261+
out->resize(u16_length);
262+
return Add(id, UnionBytes(out));
260263
}
261-
#endif // NODE_HAVE_I18N_SUPPORT
262264

263265
// Returns Local<Function> of the compiled module if return_code_cache
264266
// is false (we are only compiling the function).

src/node_builtins.h

+1-8
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,6 @@
88
#include <memory>
99
#include <set>
1010
#include <string>
11-
#if defined(NODE_HAVE_I18N_SUPPORT)
12-
#include <unicode/unistr.h>
13-
#endif // NODE_HAVE_I18N_SUPPORT
1411
#include <vector>
1512
#include "node_mutex.h"
1613
#include "node_union_bytes.h"
@@ -59,6 +56,7 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
5956
static v8::Local<v8::String> GetConfigString(v8::Isolate* isolate);
6057
static bool Exists(const char* id);
6158
static bool Add(const char* id, const UnionBytes& source);
59+
static bool Add(const char* id, std::string_view utf8source);
6260

6361
static bool CompileAllBuiltins(v8::Local<v8::Context> context);
6462
static void RefreshCodeCache(const std::vector<CodeCacheInfo>& in);
@@ -124,18 +122,13 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
124122
static void HasCachedBuiltins(
125123
const v8::FunctionCallbackInfo<v8::Value>& args);
126124

127-
#if defined(NODE_HAVE_I18N_SUPPORT)
128125
static void AddExternalizedBuiltin(const char* id, const char* filename);
129-
#endif // NODE_HAVE_I18N_SUPPORT
130126

131127
static BuiltinLoader instance_;
132128
BuiltinCategories builtin_categories_;
133129
BuiltinSourceMap source_;
134130
BuiltinCodeCacheMap code_cache_;
135131
UnionBytes config_;
136-
#if defined(NODE_HAVE_I18N_SUPPORT)
137-
std::list<std::unique_ptr<icu::UnicodeString>> externalized_source_bytes_;
138-
#endif // NODE_HAVE_I18N_SUPPORT
139132

140133
// Used to synchronize access to the code cache map
141134
Mutex code_cache_mutex_;

src/node_union_bytes.h

+9-55
Original file line numberDiff line numberDiff line change
@@ -11,57 +11,20 @@
1111

1212
namespace node {
1313

14-
class NonOwningExternalOneByteResource
15-
: public v8::String::ExternalOneByteStringResource {
16-
public:
17-
explicit NonOwningExternalOneByteResource(const uint8_t* data, size_t length)
18-
: data_(data), length_(length) {}
19-
~NonOwningExternalOneByteResource() override = default;
20-
21-
const char* data() const override {
22-
return reinterpret_cast<const char*>(data_);
23-
}
24-
size_t length() const override { return length_; }
25-
26-
NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
27-
delete;
28-
NonOwningExternalOneByteResource& operator=(
29-
const NonOwningExternalOneByteResource&) = delete;
30-
31-
private:
32-
const uint8_t* data_;
33-
size_t length_;
34-
};
35-
36-
class NonOwningExternalTwoByteResource
37-
: public v8::String::ExternalStringResource {
38-
public:
39-
explicit NonOwningExternalTwoByteResource(const uint16_t* data, size_t length)
40-
: data_(data), length_(length) {}
41-
~NonOwningExternalTwoByteResource() override = default;
42-
43-
const uint16_t* data() const override { return data_; }
44-
size_t length() const override { return length_; }
45-
46-
NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
47-
delete;
48-
NonOwningExternalTwoByteResource& operator=(
49-
const NonOwningExternalTwoByteResource&) = delete;
50-
51-
private:
52-
const uint16_t* data_;
53-
size_t length_;
54-
};
55-
5614
// Similar to a v8::String, but it's independent from Isolates
5715
// and can be materialized in Isolates as external Strings
58-
// via ToStringChecked. The data pointers are owned by the caller.
16+
// via ToStringChecked.
5917
class UnionBytes {
6018
public:
6119
UnionBytes(const uint16_t* data, size_t length)
6220
: one_bytes_(nullptr), two_bytes_(data), length_(length) {}
6321
UnionBytes(const uint8_t* data, size_t length)
6422
: one_bytes_(data), two_bytes_(nullptr), length_(length) {}
23+
template <typename T> // T = uint8_t or uint16_t
24+
explicit UnionBytes(std::shared_ptr<std::vector</*const*/ T>> data)
25+
: UnionBytes(data->data(), data->size()) {
26+
owning_ptr_ = data;
27+
}
6528

6629
UnionBytes(const UnionBytes&) = default;
6730
UnionBytes& operator=(const UnionBytes&) = default;
@@ -77,23 +40,14 @@ class UnionBytes {
7740
CHECK_NOT_NULL(one_bytes_);
7841
return one_bytes_;
7942
}
80-
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const {
81-
if (is_one_byte()) {
82-
NonOwningExternalOneByteResource* source =
83-
new NonOwningExternalOneByteResource(one_bytes_data(), length_);
84-
return v8::String::NewExternalOneByte(isolate, source).ToLocalChecked();
85-
} else {
86-
NonOwningExternalTwoByteResource* source =
87-
new NonOwningExternalTwoByteResource(two_bytes_data(), length_);
88-
return v8::String::NewExternalTwoByte(isolate, source).ToLocalChecked();
89-
}
90-
}
91-
size_t length() { return length_; }
43+
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const;
44+
size_t length() const { return length_; }
9245

9346
private:
9447
const uint8_t* one_bytes_;
9548
const uint16_t* two_bytes_;
9649
size_t length_;
50+
std::shared_ptr<void> owning_ptr_;
9751
};
9852

9953
} // namespace node

src/util.cc

+62
Original file line numberDiff line numberDiff line change
@@ -475,4 +475,66 @@ void SetConstructorFunction(Local<v8::Context> context,
475475
that->Set(context, name, tmpl->GetFunction(context).ToLocalChecked()).Check();
476476
}
477477

478+
namespace {
479+
480+
class NonOwningExternalOneByteResource
481+
: public v8::String::ExternalOneByteStringResource {
482+
public:
483+
explicit NonOwningExternalOneByteResource(const UnionBytes& source)
484+
: source_(source) {}
485+
~NonOwningExternalOneByteResource() override = default;
486+
487+
const char* data() const override {
488+
return reinterpret_cast<const char*>(source_.one_bytes_data());
489+
}
490+
size_t length() const override { return source_.length(); }
491+
492+
NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
493+
delete;
494+
NonOwningExternalOneByteResource& operator=(
495+
const NonOwningExternalOneByteResource&) = delete;
496+
497+
private:
498+
const UnionBytes source_;
499+
};
500+
501+
class NonOwningExternalTwoByteResource
502+
: public v8::String::ExternalStringResource {
503+
public:
504+
explicit NonOwningExternalTwoByteResource(const UnionBytes& source)
505+
: source_(source) {}
506+
~NonOwningExternalTwoByteResource() override = default;
507+
508+
const uint16_t* data() const override { return source_.two_bytes_data(); }
509+
size_t length() const override { return source_.length(); }
510+
511+
NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
512+
delete;
513+
NonOwningExternalTwoByteResource& operator=(
514+
const NonOwningExternalTwoByteResource&) = delete;
515+
516+
private:
517+
const UnionBytes source_;
518+
};
519+
520+
} // anonymous namespace
521+
522+
Local<String> UnionBytes::ToStringChecked(Isolate* isolate) const {
523+
if (UNLIKELY(length() == 0)) {
524+
// V8 requires non-null data pointers for empty external strings,
525+
// but we don't guarantee that. Solve this by not creating an
526+
// external string at all in that case.
527+
return String::Empty(isolate);
528+
}
529+
if (is_one_byte()) {
530+
NonOwningExternalOneByteResource* source =
531+
new NonOwningExternalOneByteResource(*this);
532+
return String::NewExternalOneByte(isolate, source).ToLocalChecked();
533+
} else {
534+
NonOwningExternalTwoByteResource* source =
535+
new NonOwningExternalTwoByteResource(*this);
536+
return String::NewExternalTwoByte(isolate, source).ToLocalChecked();
537+
}
538+
}
539+
478540
} // namespace node

test/cctest/test_util.cc

+16-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
#include "util-inl.h"
21
#include "debug_utils-inl.h"
32
#include "env-inl.h"
43
#include "gtest/gtest.h"
4+
#include "simdutf.h"
5+
#include "util-inl.h"
56

67
using node::Calloc;
78
using node::Malloc;
@@ -298,3 +299,17 @@ TEST(UtilTest, SPrintF) {
298299
const std::string with_zero = std::string("a") + '\0' + 'b';
299300
EXPECT_EQ(SPrintF("%s", with_zero), with_zero);
300301
}
302+
303+
TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) {
304+
// In simdutf, "LE" does *not* refer to Little Endian, it refers
305+
// to 16-byte code units that are stored using *host* endianness.
306+
// This is weird and confusing naming, and so we add this assertion
307+
// here to verify that this is actually the case (so that CI tells
308+
// us if it changed, because for most people Little Endian is
309+
// host endianness, so locally everything would work fine).
310+
const char utf8source[] = "\xe7\x8c\xab";
311+
char16_t u16output;
312+
size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output);
313+
EXPECT_EQ(u16len, 1u);
314+
EXPECT_EQ(u16output, 0x732B);
315+
}

0 commit comments

Comments
 (0)