Skip to content

Commit

Permalink
Add utf16 method to JSI (facebook#47356)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebook#47356

Add utf16 method to JSI. This change adds a default implementation
for all VMs that calls utf8 and manually converts the result to UTF-16.
A later change will let Hermes use internal VM information to get
the UTF-16 string directly.

Changelog: [Internal]

Reviewed By: neildhar

Differential Revision: D64918244

fbshipit-source-id: 6fc0c44fc397c2f8bb40a4262596b178ee4f1f29
  • Loading branch information
Chi Tsai authored and facebook-github-bot committed Nov 1, 2024
1 parent af384a9 commit 6ab7b70
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 0 deletions.
16 changes: 16 additions & 0 deletions packages/react-native/ReactCommon/jsi/jsi/decorator.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,13 @@ class RuntimeDecorator : public Base, private jsi::Instrumentation {
return plain_.utf8(s);
}

// Returns the UTF-16 contents of a String by forwarding the request,
// unchanged, to the wrapped runtime (plain_).
std::u16string utf16(const String& str) override {
return plain_.utf16(str);
}
// Returns the UTF-16 contents of a PropNameID by forwarding the request,
// unchanged, to the wrapped runtime (plain_).
std::u16string utf16(const PropNameID& sym) override {
return plain_.utf16(sym);
}

Object createObject() override {
return plain_.createObject();
};
Expand Down Expand Up @@ -674,6 +681,15 @@ class WithRuntimeDecorator : public RuntimeDecorator<Plain, Base> {
return RD::utf8(s);
}

// Returns the UTF-16 contents of a String by delegating to RD::utf16.
std::u16string utf16(const String& str) override {
// The Around object is built from with_ and stays alive until the return
// value has been produced.
// NOTE(review): presumably this runs with_'s enter/exit hooks around the
// call -- confirm against the definition of Around.
Around around{with_};
return RD::utf16(str);
}
// Returns the UTF-16 contents of a PropNameID by delegating to RD::utf16.
std::u16string utf16(const PropNameID& sym) override {
// The Around object is built from with_ and stays alive until the return
// value has been produced.
// NOTE(review): presumably this runs with_'s enter/exit hooks around the
// call -- confirm against the definition of Around.
Around around{with_};
return RD::utf16(sym);
}

Value createValueFromJsonUtf8(const uint8_t* json, size_t length) override {
Around around{with_};
return RD::createValueFromJsonUtf8(json, length);
Expand Down
111 changes: 111 additions & 0 deletions packages/react-native/ReactCommon/jsi/jsi/jsi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,107 @@ Value callGlobalFunction(Runtime& runtime, const char* name, const Value& arg) {
return f.call(runtime, arg);
}

// Decode a single code point from a sequence of UTF-8 encoded bytes and move
// the input cursor past the bytes that were consumed.
//
// When the bytes at the cursor do not form a valid encoding, the Unicode
// replacement character (U+FFFD) is returned instead:
//  - an invalid lead byte consumes one byte;
//  - a missing or malformed continuation byte consumes only the bytes that
//    precede it, so the offending byte is decoded again on the next call;
//  - an overlong encoding, or a value above U+10FFFF, consumes the whole
//    sequence.
// Code points in the surrogate range (U+D800..U+DFFF) are not rejected here.
//
// Bounds checking relies on the input being zero terminated: a continuation
// byte is only read after every byte before it has been validated, and the
// terminator is never a valid continuation byte.
// \param input char pointer pointing to the current character; advanced in
//   place past the consumed bytes
// \return Unicode codepoint, or U+FFFD for an invalid sequence
uint32_t decodeUTF8(const char*& input) {
  constexpr uint32_t kReplacementCharacter = 0xFFFD;
  constexpr uint32_t kMaxCodePoint = 0x10FFFF;

  const auto byteAt = [&input](int offset) -> uint32_t {
    return static_cast<unsigned char>(input[offset]);
  };

  const uint32_t lead = byteAt(0);
  if (lead <= 0x7f) {
    // Plain ASCII: the byte is the code point.
    input += 1;
    return lead;
  }

  // Classify the lead byte: how many continuation bytes follow, which payload
  // bits it contributes, and the smallest code point that legitimately needs
  // an encoding of this length (anything smaller is an overlong encoding).
  int continuationCount = 0;
  uint32_t codePoint = 0;
  uint32_t smallestAllowed = 0;
  if ((lead & 0xE0) == 0xC0) {
    continuationCount = 1;
    codePoint = lead & 0x1F;
    smallestAllowed = 0x80;
  } else if ((lead & 0xF0) == 0xE0) {
    continuationCount = 2;
    codePoint = lead & 0x0F;
    smallestAllowed = 0x800;
  } else if ((lead & 0xF8) == 0xF0) {
    continuationCount = 3;
    codePoint = lead & 0x07;
    smallestAllowed = 0x10000;
  } else {
    // Stray continuation byte or a lead byte for a 5+ byte sequence.
    input += 1;
    return kReplacementCharacter;
  }

  // Continuation bytes are read one at a time so that nothing past the first
  // invalid byte (possibly the zero terminator) is ever touched.
  for (int index = 1; index <= continuationCount; ++index) {
    const uint32_t next = byteAt(index);
    if ((next & 0xC0) != 0x80) {
      // Not of the form 0b10xxxxxx: skip only the bytes already accepted.
      input += index;
      return kReplacementCharacter;
    }
    codePoint = (codePoint << 6) | (next & 0x3F);
  }

  input += continuationCount + 1;
  if (codePoint < smallestAllowed || codePoint > kMaxCodePoint) {
    return kReplacementCharacter;
  }
  return codePoint;
}

// Append the UTF-16 encoding of a valid 32-bit Unicode code point to `out`.
// Code points below 0x10000 are appended as a single code unit; anything
// larger is appended as a high/low surrogate pair. No validation is done
// here: the caller is expected to pass a valid code point.
void encodeUTF16(std::u16string& out, uint32_t cp) {
  if (cp >= 0x10000) {
    // Supplementary plane: split the 20-bit offset from 0x10000 into two
    // 10-bit halves carried by the high and low surrogates.
    const uint32_t offset = cp - 0x10000;
    const auto high = static_cast<char16_t>(0xD800 + ((offset >> 10) & 0x3FF));
    const auto low = static_cast<char16_t>(0xDC00 + (offset & 0x3FF));
    out.push_back(high);
    out.push_back(low);
  } else {
    out.push_back(static_cast<char16_t>(cp));
  }
}

// Re-encode a UTF-8 string as UTF-16. Each invalid UTF-8 sequence in the
// input is represented in the result by the replacement character (U+FFFD).
std::u16string convertUTF8ToUTF16(const std::string& utf8) {
  std::u16string utf16;
  const char* const stop = utf8.data() + utf8.length();
  // decodeUTF8 advances the cursor itself, so the loop has no increment.
  for (const char* cursor = utf8.data(); cursor < stop;) {
    encodeUTF16(utf16, decodeUTF8(cursor));
  }
  return utf16;
}

} // namespace

Buffer::~Buffer() = default;
Expand Down Expand Up @@ -147,6 +248,16 @@ Value Runtime::createValueFromJsonUtf8(const uint8_t* json, size_t length) {
return parseJson.call(*this, String::createFromUtf8(*this, json, length));
}

// Default implementation: obtain the UTF-8 form of the property name via
// utf8() and re-encode it as UTF-16.
std::u16string Runtime::utf16(const PropNameID& sym) {
  return convertUTF8ToUTF16(utf8(sym));
}

// Default implementation: obtain the UTF-8 form of the string via utf8() and
// re-encode it as UTF-16.
std::u16string Runtime::utf16(const String& str) {
  return convertUTF8ToUTF16(utf8(str));
}

Pointer& Pointer::operator=(Pointer&& other) {
if (ptr_) {
ptr_->invalidate();
Expand Down
13 changes: 13 additions & 0 deletions packages/react-native/ReactCommon/jsi/jsi/jsi.h
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,9 @@ class JSI_EXPORT Runtime {
const jsi::Object& obj,
size_t amount) = 0;

// Return the contents of a String / PropNameID encoded as UTF-16. These are
// not pure virtual: the default implementations in jsi.cpp call utf8() and
// convert the result to UTF-16, so a runtime only needs to override them if
// it wants to supply its own implementation.
virtual std::u16string utf16(const String& str);
virtual std::u16string utf16(const PropNameID& sym);

// These exist so derived classes can access the private parts of
// Value, Symbol, String, and Object, which are all friends of Runtime.
template <typename T>
Expand Down Expand Up @@ -501,6 +504,11 @@ class JSI_EXPORT PropNameID : public Pointer {
return runtime.utf8(*this);
}

/// Copies the data in a PropNameID as utf16 into a C++ string.
/// Forwards to Runtime::utf16(const PropNameID&) on the given runtime.
std::u16string utf16(Runtime& runtime) const {
return runtime.utf16(*this);
}

static bool compare(
Runtime& runtime,
const jsi::PropNameID& a,
Expand Down Expand Up @@ -651,6 +659,11 @@ class JSI_EXPORT String : public Pointer {
return runtime.utf8(*this);
}

/// Copies the data in a JS string as utf16 into a C++ string.
/// Forwards to Runtime::utf16(const String&) on the given runtime.
std::u16string utf16(Runtime& runtime) const {
return runtime.utf16(*this);
}

friend class Runtime;
friend class Value;
};
Expand Down
59 changes: 59 additions & 0 deletions packages/react-native/ReactCommon/jsi/jsi/test/testlib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1576,6 +1576,65 @@ TEST_P(JSITest, UTF8ExceptionTest) {
}
}

// Exercises the default Runtime::utf16 UTF-8 -> UTF-16 conversion with valid
// ASCII, BMP and supplementary-plane text, and with several malformed UTF-8
// inputs that must decode to the replacement character (U+FFFD).
TEST_P(JSITest, UTF16Test) {
// This Runtime Decorator is used to test the conversion from UTF-8 to UTF-16
// in the default utf16 method for runtimes that do not provide their own
// utf16 implementation.
class UTF16RD : public RuntimeDecorator<Runtime, Runtime> {
public:
UTF16RD(Runtime& rt) : RuntimeDecorator(rt) {}

// Ignores the String argument and returns whatever the test stored in
// utf8Str, so arbitrary (including invalid) UTF-8 can be fed to utf16().
std::string utf8(const String&) override {
return utf8Str;
}

// Calls the base Runtime::utf16 directly so that the default conversion is
// what gets tested, rather than the decorator's forwarding override.
std::u16string utf16(const String& str) override {
return Runtime::utf16(str);
}

// The UTF-8 bytes that utf8() reports for every String.
std::string utf8Str;
};

UTF16RD rd = UTF16RD(rt);
// The contents of this String are irrelevant; utf8() above never reads it.
String str = String::createFromUtf8(rd, "placeholder");

rd.utf8Str = "foobar";
EXPECT_EQ(str.utf16(rd), u"foobar");

rd.utf8Str = "你好";
EXPECT_EQ(str.utf16(rd), u"你好");

rd.utf8Str = "👍";
EXPECT_EQ(str.utf16(rd), u"👍");

rd.utf8Str = "foobar👍你好";
EXPECT_EQ(str.utf16(rd), u"foobar👍你好");

// String ended before second byte of the encoding
rd.utf8Str = "\xcf";
EXPECT_EQ(str.utf16(rd), u"\uFFFD");

// Third byte should follow the pattern of 0b10xxxxxx
rd.utf8Str = "\xef\x8f\x29";
EXPECT_EQ(str.utf16(rd), u"\uFFFD\u0029");

// U+2200 should be encoded in 3 bytes as 0xE2 0x88 0x80, not 4 bytes
rd.utf8Str = "\xf0\x82\x88\x80";
EXPECT_EQ(str.utf16(rd), u"\uFFFD");

// Unicode Max Value is U+10FFFF, U+11FFFF is invalid
rd.utf8Str = "\xf4\x9f\xbf\xbf";
EXPECT_EQ(str.utf16(rd), u"\uFFFD");

// Missing the third byte of the 3-byte encoding, followed by 'z'
rd.utf8Str = "\xe1\xa0\x7a";
EXPECT_EQ(str.utf16(rd), u"\uFFFD\u007A");

// Lead byte of a 3-byte encoding followed directly by 'z', which is not a
// valid continuation byte (0b10xxxxxx)
rd.utf8Str = "\xea\x7a";
EXPECT_EQ(str.utf16(rd), u"\uFFFD\u007A");
}

INSTANTIATE_TEST_CASE_P(
Runtimes,
JSITest,
Expand Down

0 comments on commit 6ab7b70

Please sign in to comment.