Skip to content
This repository has been archived by the owner on Oct 15, 2020. It is now read-only.

Commit

Permalink
[Merge chakra-core/ChakraCore@7b7ddfe8f3] [MERGE #3609 @MSLaguana] Ch…
Browse files Browse the repository at this point in the history
…anging utf8 conversion codex to improve perf and safety

Merge pull request #3609 from MSLaguana:utfConversionWithoutAllocation

With this change the utf8Codex becomes aware of the size of the buffer it
is given to write into, and will assert if the buffer is too small.

utf8Codex now also supports just counting the size that a utf8 string
would be without needing a buffer to write to.

To make use of this, the Jsrt JsCopyString method has been modified so that
it can be used to extract a string with only one allocation, down from ~3.

This has a ~1.3% perf gain in node-chakracore acmeair
  • Loading branch information
chakrabot authored and MSLaguana committed Sep 25, 2017
1 parent e1adb29 commit 9b071a2
Show file tree
Hide file tree
Showing 16 changed files with 270 additions and 157 deletions.
9 changes: 9 additions & 0 deletions deps/chakrashim/core/bin/NativeTests/CodexAssert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,12 @@ void CodexAssert(bool condition)
condition;
Assert(condition);
}

// Runs Assert on the condition and then, when the condition is false,
// terminates the current process, passing DBG_TERMINATE_PROCESS as the
// exit code. A true condition returns to the caller with no further action.
void CodexAssertOrFailFast(bool condition)
{
    Assert(condition);

    if (condition)
    {
        return;
    }

    // Reached only when the condition does not hold.
    TerminateProcess(GetCurrentProcess(), (UINT)DBG_TERMINATE_PROCESS);
}
4 changes: 2 additions & 2 deletions deps/chakrashim/core/bin/ch/ChakraRtInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ struct JsAPIHooks
// Function-pointer typedefs for hooked Jsrt entry points. Every one uses the
// WINAPI calling convention and returns a JsErrorCode.
// NOTE(review): JsrtCopyString is listed twice below - once with separate
// writtenLength/actualLength out-parameters and once with a single length
// out-parameter. These look like the before/after lines of the change shown
// on this page rather than two coexisting typedefs - confirm against the file.
typedef JsErrorCode(WINAPI *JsrtSerialize)(JsValueRef script, JsValueRef *buffer, JsParseScriptAttributes parseAttributes);
typedef JsErrorCode(WINAPI *JsrtRunSerialized)(JsValueRef buffer, JsSerializedLoadScriptCallback scriptLoadCallback, JsSourceContext sourceContext, JsValueRef sourceUrl, JsValueRef * result);
typedef JsErrorCode(WINAPI *JsrtGetStringLength)(JsValueRef value, int *stringLength);
typedef JsErrorCode(WINAPI *JsrtCopyString)(JsValueRef value, char* buffer, size_t bufferSize, size_t* writtenLength, size_t* actualLength);
typedef JsErrorCode(WINAPI *JsrtCopyString)(JsValueRef value, char* buffer, size_t bufferSize, size_t* length);
typedef JsErrorCode(WINAPI *JsrtCreateString)(const char *content, size_t length, JsValueRef *value);
typedef JsErrorCode(WINAPI *JsrtCreateStringUtf16)(const uint16_t *content, size_t length, JsValueRef *value);

Expand Down Expand Up @@ -399,7 +399,7 @@ class ChakraRTInterface
// Static forwarders: each one hands its arguments, unchanged and in order, to
// the same-named hook through HOOK_JS_API (macro defined outside this view)
// and returns the result as a JsErrorCode.
// NOTE(review): JsCopyString appears in a five-argument form
// (writtenLength + actualLength) and a four-argument form (length). These
// look like the before/after lines of the change shown on this page - confirm
// which one the file actually contains.
static JsErrorCode WINAPI JsSerialize(JsValueRef script, JsValueRef *buffer, JsParseScriptAttributes parseAttributes) { return HOOK_JS_API(Serialize(script, buffer, parseAttributes)); }
static JsErrorCode WINAPI JsRunSerialized(JsValueRef buffer, JsSerializedLoadScriptCallback scriptLoadCallback, JsSourceContext sourceContext, JsValueRef sourceUrl, JsValueRef * result) { return HOOK_JS_API(RunSerialized(buffer, scriptLoadCallback, sourceContext, sourceUrl, result)); }
static JsErrorCode WINAPI JsGetStringLength(JsValueRef value, int *stringLength) { return HOOK_JS_API(GetStringLength(value, stringLength)); }
static JsErrorCode WINAPI JsCopyString(JsValueRef value, char* buffer, size_t bufferSize, size_t* writtenLength, size_t* actualLength) { return HOOK_JS_API(CopyString(value, buffer, bufferSize, writtenLength, actualLength)); }
static JsErrorCode WINAPI JsCopyString(JsValueRef value, char* buffer, size_t bufferSize, size_t* length) { return HOOK_JS_API(CopyString(value, buffer, bufferSize, length)); }
static JsErrorCode WINAPI JsCreateString(const char *content, size_t length, JsValueRef *value) { return HOOK_JS_API(CreateString(content, length, value)); }
static JsErrorCode WINAPI JsCreateStringUtf16(const uint16_t *content, size_t length, JsValueRef *value) { return HOOK_JS_API(CreateStringUtf16(content, length, value)); }
static JsErrorCode WINAPI JsCreatePropertyId(const char *name, size_t length, JsPropertyIdRef *propertyId) { return HOOK_JS_API(CreatePropertyId(name, length, propertyId)); }
Expand Down
9 changes: 9 additions & 0 deletions deps/chakrashim/core/bin/ch/CodexAssert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,13 @@ void CodexAssert(bool condition)
{
Assert(condition);
}

// Fail-fast variant of CodexAssert: the condition is first passed to Assert,
// and if it is false the current process is then terminated with
// DBG_TERMINATE_PROCESS as its exit code.
void CodexAssertOrFailFast(bool condition)
{
    Assert(condition);

    const bool conditionHolds = condition;
    if (conditionHolds)
    {
        return;
    }

    TerminateProcess(GetCurrentProcess(), (UINT)DBG_TERMINATE_PROCESS);
}
#endif
28 changes: 7 additions & 21 deletions deps/chakrashim/core/bin/ch/stdafx.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,38 +206,24 @@ class AutoString
{
strValue = value;
}
int strLen = 0;
size_t writtenLen = 0;
size_t actualLen = 0;
size_t length = 0;
if (errorCode == JsNoError)
{
errorCode = ChakraRTInterface::JsGetStringLength(strValue, &strLen);
errorCode = ChakraRTInterface::JsCopyString(strValue, nullptr, 0, &length);
if (errorCode == JsNoError)
{
// Assume ascii characters
data = (char*)malloc((strLen + 1) * sizeof(char));
errorCode = ChakraRTInterface::JsCopyString(strValue, data, strLen, &writtenLen, &actualLen);
data = (char*)malloc((length + 1) * sizeof(char));
size_t writtenLength = 0;
errorCode = ChakraRTInterface::JsCopyString(strValue, data, length, &writtenLength);
if (errorCode == JsNoError)
{
// If non-ascii, take slow path
if (writtenLen != actualLen)
{
free(data);
data = (char*)malloc((actualLen + 1) * sizeof(char));

errorCode = ChakraRTInterface::JsCopyString(strValue, data, actualLen + 1, &writtenLen, nullptr);
if (errorCode == JsNoError)
{
AssertMsg(actualLen == writtenLen, "If you see this message.. There is something seriously wrong. Good Luck!");

}
}
AssertMsg(length == writtenLength, "Inconsistent length in utf8 encoding");
}
}
}
if (errorCode == JsNoError)
{
*(data + actualLen) = char(0);
*(data + length) = char(0);
}
return errorCode;
}
Expand Down
93 changes: 28 additions & 65 deletions deps/chakrashim/core/lib/Common/Codex/Utf8Codex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@
#pragma warning(disable: 4127) // constant expression for template parameter
#endif

extern void CodexAssert(bool condition);

namespace utf8
{
const unsigned int mAlignmentMask = 0x3;
Expand Down Expand Up @@ -333,62 +331,6 @@ namespace utf8
return ch;
}

// Encodes one UTF-16 code unit as one, two or three UTF-8 bytes starting at
// ptr and returns the position immediately after the last byte written.
// The width is chosen purely from the value of ch: below 0x0080 is one byte,
// below 0x0800 is two bytes, and everything else is three bytes.
LPUTF8 EncodeFull(char16 ch, __out_ecount(3) LPUTF8 ptr)
{
    if (ch >= 0x0800)
    {
        // Three bytes : 1110yyyy 10yyyyxx 10xxxxxx
        ptr[0] = static_cast<utf8char_t>(ch >> 12) | 0xE0;
        ptr[1] = static_cast<utf8char_t>((ch >> 6) & 0x3F) | 0x80;
        ptr[2] = static_cast<utf8char_t>(ch & 0x3F) | 0x80;
        return ptr + 3;
    }

    if (ch >= 0x0080)
    {
        // Two bytes : 110yyyxx 10xxxxxx
        ptr[0] = static_cast<utf8char_t>(ch >> 6) | 0xc0;
        ptr[1] = static_cast<utf8char_t>(ch & 0x3F) | 0x80;
        return ptr + 2;
    }

    // One byte
    ptr[0] = static_cast<utf8char_t>(ch);
    return ptr + 1;
}

// Recombines a UTF-16 surrogate pair into the code point it represents and
// writes that code point at ptr as four UTF-8 bytes. Returns the position
// immediately after the fourth byte.
_Use_decl_annotations_
LPUTF8 EncodeSurrogatePair(char16 surrogateHigh, char16 surrogateLow, LPUTF8 ptr)
{
    // Building a surrogate pair from a code point works like this:
    //   1. subtract 0x10000 from the code point
    //   2. split the remainder into its upper and lower ten bits
    //   3. add 0xD800 to the upper ten bits and 0xDC00 to the lower ten bits
    // Here the same steps are undone in reverse to recover the code point.
    const uint32 upperTenBits = (surrogateHigh - 0xD800);
    const uint32 lowerTenBits = (surrogateLow - 0xDC00);
    const uint32 scalarValue = 0x10000 + ((upperTenBits << 10) | lowerTenBits);

    // 0x10FFFF is the largest valid Unicode code point. A surrogate pair
    // cannot express anything larger, so this is only asserted.
    CodexAssert(scalarValue <= 0x10FFFF);

    // Code points that need a surrogate pair take four bytes in UTF-8. The
    // 21 bits are distributed as 3 bits in the lead byte followed by three
    // continuation bytes carrying 6 bits each.
    ptr[0] = static_cast<utf8char_t>(scalarValue >> 18) | 0xF0;
    ptr[1] = static_cast<utf8char_t>((scalarValue >> 12) & 0x3F) | 0x80;
    ptr[2] = static_cast<utf8char_t>((scalarValue >> 6) & 0x3F) | 0x80;
    ptr[3] = static_cast<utf8char_t>(scalarValue & 0x3F) | 0x80;

    return ptr + 4;
}

LPCUTF8 NextCharFull(LPCUTF8 ptr)
{
return ptr + EncodedBytes(*ptr);
Expand Down Expand Up @@ -489,13 +431,15 @@ namespace utf8
return true;
}

template <bool cesu8Encoding>
template <bool cesu8Encoding, bool countBytesOnly>
__range(0, cchIn * 3)
size_t EncodeIntoImpl(__out_ecount(cchIn * 3) LPUTF8 buffer, __in_ecount(cchIn) const char16 *source, charcount_t cchIn)
size_t EncodeIntoImpl(_When_(!countBytesOnly, __out_ecount(cchIn * 3)) LPUTF8 buffer, __in_ecount(cchIn) const char16 *source, charcount_t cchIn, const void* bufferEnd)
{
charcount_t cch = cchIn; // SAL analysis gets confused by EncodeTrueUtf8's dest buffer requirement unless we alias cchIn with a local
LPUTF8 dest = buffer;

CodexAssertOrFailFast(dest <= bufferEnd);

if (!ShouldFastPath(dest, source)) goto LSlowPath;

LFastPath:
Expand All @@ -505,18 +449,24 @@ namespace utf8
if ( (first & 0xFF80FF80) != 0) goto LSlowPath;
uint32 second = ((const uint32 *)source)[1];
if ( (second & 0xFF80FF80) != 0) goto LSlowPath;
*(uint32 *)dest = (first & 0x0000007F) | ((first & 0x007F0000) >> 8) | ((second & 0x0000007f) << 16) | ((second & 0x007F0000) << 8);

if (!countBytesOnly)
{
CodexAssertOrFailFast(dest + 4 <= bufferEnd);
*(uint32 *)dest = (first & 0x0000007F) | ((first & 0x007F0000) >> 8) | ((second & 0x0000007f) << 16) | ((second & 0x007F0000) << 8);
}
dest += 4;
source += 4;
cch -= 4;

}

LSlowPath:
if (cesu8Encoding)
{
while (cch-- > 0)
{
dest = Encode(*source++, dest);
dest = Encode<countBytesOnly>(*source++, dest, bufferEnd);
if (ShouldFastPath(dest, source)) goto LFastPath;
}
}
Expand All @@ -528,7 +478,7 @@ namespace utf8
// If the code unit turns out to be the high surrogate in a surrogate pair, then
// EncodeTrueUtf8 will consume the low surrogate code unit too by decrementing cch
// and incrementing source
dest = EncodeTrueUtf8(*source++, &source, &cch, dest);
dest = EncodeTrueUtf8<countBytesOnly>(*source++, &source, &cch, dest, bufferEnd);
if (ShouldFastPath(dest, source)) goto LFastPath;
}
}
Expand All @@ -539,7 +489,7 @@ namespace utf8
// Encodes cch UTF-16 code units from source into buffer by calling
// EncodeIntoImpl with cesu8Encoding = true.
// NOTE(review): two return statements are visible. The first is the
// three-argument call; the second additionally selects countBytesOnly = false
// and passes &buffer[cch*3] as bufferEnd, which matches the
// __out_ecount(cch * 3) annotation on buffer. These look like the
// before/after lines of the change shown on this page - confirm which one the
// file actually contains.
__range(0, cch * 3)
size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
{
return EncodeIntoImpl<true>(buffer, source, cch);
return EncodeIntoImpl<true, false>(buffer, source, cch, &buffer[cch*3]);
}

__range(0, cch * 3)
Expand All @@ -553,11 +503,24 @@ namespace utf8
// Encodes cch UTF-16 code units from source into buffer by calling
// EncodeIntoImpl with cesu8Encoding = false, then stores a 0 byte at
// buffer[result] and returns result. The buffer annotation is cch * 3 + 1
// elements, one more than the cch * 3 upper bound given by __range.
// NOTE(review): result is declared twice. The first line is the
// three-argument call; the second additionally selects countBytesOnly = false
// and passes &buffer[3 * cch] as bufferEnd. These look like the before/after
// lines of the change shown on this page - confirm which one the file
// actually contains.
__range(0, cch * 3)
size_t EncodeTrueUtf8IntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
{
size_t result = EncodeIntoImpl<false>(buffer, source, cch);
size_t result = EncodeIntoImpl<false, false>(buffer, source, cch, &buffer[3 * cch]);
// Terminate the encoded output with a 0 byte directly after it.
buffer[result] = 0;
return result;
}

// Encodes cch UTF-16 code units from source into buffer by calling
// EncodeIntoImpl with cesu8Encoding = false and countBytesOnly = false.
// Unlike the sibling entry points, the end of the destination is not derived
// from cch: the caller's bufferEnd is handed to EncodeIntoImpl unchanged.
// Returns EncodeIntoImpl's result; no terminator is appended here.
__range(0, cch * 3)
size_t EncodeTrueUtf8IntoBoundsChecked(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch, const void * bufferEnd)
{
    return EncodeIntoImpl<false, false>(buffer, source, cch, bufferEnd);
}

// Calls EncodeIntoImpl with cesu8Encoding = false and countBytesOnly = true
// for the cch UTF-16 code units at source, and returns its result. No
// destination is supplied: both the buffer and bufferEnd arguments are
// nullptr.
// NOTE(review): presumably the result is the number of bytes the encoding
// would occupy - EncodeIntoImpl's return statement is not visible here, so
// confirm.
__range(0, cch * 3)
size_t CountTrueUtf8(__in_ecount(cch) const char16 *source, charcount_t cch)
{
return EncodeIntoImpl<false, true>(nullptr, source, cch, nullptr);
}

// Convert the character index into a byte index.
size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, charcount_t cchIndex, DecodeOptions options)
{
Expand Down
Loading

0 comments on commit 9b071a2

Please sign in to comment.