[Merge chakra-core/ChakraCore@7b7ddfe8f3] [MERGE #3609 @MSLaguana] Ch…

…anging utf8 conversion codex to improve perf and safety. Merge pull request #3609 from MSLaguana:utfConversionWithoutAllocation. With this change, the utf8Codex becomes aware of the size of the buffer it is given to write into, and will assert if the buffer is too small. utf8Codex now also supports just counting the size that a utf8 string would be without needing a buffer to write to. To make use of this, the Jsrt JsCopyString method has been modified so that it can be used to extract a string with only one allocation, down from ~3. This has a ~1.3% perf gain in node-chakracore acmeair.
nodejs · Sep 25, 2017 · 9b071a2 · 9b071a2
1 parent e1adb29
commit 9b071a2
Show file tree

Hide file tree

Showing 16 changed files with 270 additions and 157 deletions.
diff --git a/deps/chakrashim/core/bin/NativeTests/CodexAssert.cpp b/deps/chakrashim/core/bin/NativeTests/CodexAssert.cpp
@@ -11,3 +11,12 @@ void CodexAssert(bool condition)
     condition;
     Assert(condition);
 }
+
+void CodexAssertOrFailFast(bool condition)
+{
+    Assert(condition);
+    if (!condition)
+    {
+        TerminateProcess(GetCurrentProcess(), (UINT)DBG_TERMINATE_PROCESS);
+    }
+}
diff --git a/deps/chakrashim/core/bin/ch/ChakraRtInterface.h b/deps/chakrashim/core/bin/ch/ChakraRtInterface.h
@@ -78,7 +78,7 @@ struct JsAPIHooks
     typedef JsErrorCode(WINAPI *JsrtSerialize)(JsValueRef script, JsValueRef *buffer, JsParseScriptAttributes parseAttributes);
     typedef JsErrorCode(WINAPI *JsrtRunSerialized)(JsValueRef buffer, JsSerializedLoadScriptCallback scriptLoadCallback, JsSourceContext sourceContext, JsValueRef sourceUrl, JsValueRef * result);
     typedef JsErrorCode(WINAPI *JsrtGetStringLength)(JsValueRef value, int *stringLength);
-    typedef JsErrorCode(WINAPI *JsrtCopyString)(JsValueRef value, char* buffer, size_t bufferSize, size_t* writtenLength, size_t* actualLength);
+    typedef JsErrorCode(WINAPI *JsrtCopyString)(JsValueRef value, char* buffer, size_t bufferSize, size_t* length);
     typedef JsErrorCode(WINAPI *JsrtCreateString)(const char *content, size_t length, JsValueRef *value);
     typedef JsErrorCode(WINAPI *JsrtCreateStringUtf16)(const uint16_t *content, size_t length, JsValueRef *value);
 
@@ -399,7 +399,7 @@ class ChakraRTInterface
     static JsErrorCode WINAPI JsSerialize(JsValueRef script, JsValueRef *buffer, JsParseScriptAttributes parseAttributes) { return HOOK_JS_API(Serialize(script, buffer, parseAttributes)); }
     static JsErrorCode WINAPI JsRunSerialized(JsValueRef buffer, JsSerializedLoadScriptCallback scriptLoadCallback, JsSourceContext sourceContext, JsValueRef sourceUrl, JsValueRef * result) { return HOOK_JS_API(RunSerialized(buffer, scriptLoadCallback, sourceContext, sourceUrl, result)); }
     static JsErrorCode WINAPI JsGetStringLength(JsValueRef value, int *stringLength) { return HOOK_JS_API(GetStringLength(value, stringLength)); }
-    static JsErrorCode WINAPI JsCopyString(JsValueRef value, char* buffer, size_t bufferSize, size_t* writtenLength, size_t* actualLength) { return HOOK_JS_API(CopyString(value, buffer, bufferSize, writtenLength, actualLength)); }
+    static JsErrorCode WINAPI JsCopyString(JsValueRef value, char* buffer, size_t bufferSize, size_t* length) { return HOOK_JS_API(CopyString(value, buffer, bufferSize, length)); }
     static JsErrorCode WINAPI JsCreateString(const char *content, size_t length, JsValueRef *value) { return HOOK_JS_API(CreateString(content, length, value)); }
     static JsErrorCode WINAPI JsCreateStringUtf16(const uint16_t *content, size_t length, JsValueRef *value) { return HOOK_JS_API(CreateStringUtf16(content, length, value)); }
     static JsErrorCode WINAPI JsCreatePropertyId(const char *name, size_t length, JsPropertyIdRef *propertyId) { return HOOK_JS_API(CreatePropertyId(name, length, propertyId)); }

diff --git a/deps/chakrashim/core/bin/ch/CodexAssert.cpp b/deps/chakrashim/core/bin/ch/CodexAssert.cpp
@@ -11,4 +11,13 @@ void CodexAssert(bool condition)
 {
     Assert(condition);
 }
+
+void CodexAssertOrFailFast(bool condition)
+{
+    Assert(condition);
+    if (!condition)
+    {
+        TerminateProcess(GetCurrentProcess(), (UINT)DBG_TERMINATE_PROCESS);
+    }
+}
 #endif
diff --git a/deps/chakrashim/core/bin/ch/stdafx.h b/deps/chakrashim/core/bin/ch/stdafx.h
@@ -206,38 +206,24 @@ class AutoString
         {
             strValue = value;
         }
-        int strLen = 0;
-        size_t writtenLen = 0;
-        size_t actualLen = 0;
+        size_t length = 0;
         if (errorCode == JsNoError)
         {
-            errorCode = ChakraRTInterface::JsGetStringLength(strValue, &strLen);
+            errorCode = ChakraRTInterface::JsCopyString(strValue, nullptr, 0, &length);
             if (errorCode == JsNoError)
             {
-                // Assume ascii characters
-                data = (char*)malloc((strLen + 1) * sizeof(char));
-                errorCode = ChakraRTInterface::JsCopyString(strValue, data, strLen, &writtenLen, &actualLen);
+                data = (char*)malloc((length + 1) * sizeof(char));
+                size_t writtenLength = 0;
+                errorCode = ChakraRTInterface::JsCopyString(strValue, data, length, &writtenLength);
                 if (errorCode == JsNoError)
                 {
-                    // If non-ascii, take slow path
-                    if (writtenLen != actualLen)
-                    {
-                        free(data);
-                        data = (char*)malloc((actualLen + 1) * sizeof(char));
-
-                        errorCode = ChakraRTInterface::JsCopyString(strValue, data, actualLen + 1, &writtenLen, nullptr);
-                        if (errorCode == JsNoError)
-                        {
-                            AssertMsg(actualLen == writtenLen, "If you see this message.. There is something seriously wrong. Good Luck!");
-
-                        }
-                    }
+                    AssertMsg(length == writtenLength, "Inconsistent length in utf8 encoding");
                 }
             }
         }
         if (errorCode == JsNoError)
         {
-            *(data + actualLen) = char(0);
+            *(data + length) = char(0);
         }
         return errorCode;
     }

diff --git a/deps/chakrashim/core/lib/Common/Codex/Utf8Codex.cpp b/deps/chakrashim/core/lib/Common/Codex/Utf8Codex.cpp
@@ -19,8 +19,6 @@
 #pragma warning(disable: 4127)  // constant expression for template parameter
 #endif
 
-extern void CodexAssert(bool condition);
-
 namespace utf8
 {
     const unsigned int mAlignmentMask = 0x3;
@@ -333,62 +331,6 @@ namespace utf8
         return ch;
     }
 
-    LPUTF8 EncodeFull(char16 ch, __out_ecount(3) LPUTF8 ptr)
-    {
-        if( ch < 0x0080 )
-        {
-            // One byte
-            *ptr++ = static_cast< utf8char_t >(ch);
-        }
-        else if( ch < 0x0800 )
-        {
-            // Two bytes   : 110yyyxx 10xxxxxx
-            *ptr++ = static_cast<utf8char_t>(ch >> 6) | 0xc0;
-            *ptr++ = static_cast<utf8char_t>(ch & 0x3F) | 0x80;
-        }
-        else
-        {
-            // Three bytes : 1110yyyy 10yyyyxx 10xxxxxx
-            *ptr++ = static_cast<utf8char_t>(ch >> 12) | 0xE0;
-            *ptr++ = static_cast<utf8char_t>((ch >> 6) & 0x3F) | 0x80;
-            *ptr++ = static_cast<utf8char_t>(ch & 0x3F) | 0x80;
-        }
-
-        return ptr;
-    }
-
-    _Use_decl_annotations_
-    LPUTF8 EncodeSurrogatePair(char16 surrogateHigh, char16 surrogateLow, LPUTF8 ptr)
-    {
-        // A unicode codepoint is encoded into a surrogate pair by doing the following:
-        //  subtract 0x10000 from the codepoint
-        //  Split the resulting value into the high-ten bits and low-ten bits
-        //  Add 0xD800 to the high ten bits, and 0xDC00 to the low ten bits
-        // Below, we want to decode the surrogate pair to its original codepoint
-        // So we do the above process in reverse
-        uint32 highTen = (surrogateHigh - 0xD800);
-        uint32 lowTen  = (surrogateLow - 0xDC00);
-        uint32 codepoint = 0x10000 + ((highTen << 10) | lowTen);
-
-        // This is the maximum valid unicode codepoint
-        // This should be ensured anyway since you can't encode a value higher
-        // than this as a surrogate pair, so we assert this here
-        CodexAssert(codepoint <= 0x10FFFF);
-
-        // Now we need to encode the code point into utf-8
-        // Codepoints in the range that gets encoded into a surrogate pair
-        // gets encoded into 4 bytes under utf8
-        // Since the codepoint can be represented by 21 bits, the encoding
-        // does the following: first 3 bits in the first byte, the next 6 in the
-        // second, the next six in the third, and the last six in the 4th byte
-        *ptr++ = static_cast<utf8char_t>(codepoint >> 18) | 0xF0;
-        *ptr++ = static_cast<utf8char_t>((codepoint >> 12) & 0x3F) | 0x80;
-        *ptr++ = static_cast<utf8char_t>((codepoint >> 6) & 0x3F) | 0x80;
-        *ptr++ = static_cast<utf8char_t>(codepoint & 0x3F) | 0x80;
-
-        return ptr;
-    }
-
     LPCUTF8 NextCharFull(LPCUTF8 ptr)
     {
         return ptr + EncodedBytes(*ptr);
@@ -489,13 +431,15 @@ namespace utf8
         return true;
     }
 
-    template <bool cesu8Encoding>
+    template <bool cesu8Encoding, bool countBytesOnly>
     __range(0, cchIn * 3)
-    size_t EncodeIntoImpl(__out_ecount(cchIn * 3) LPUTF8 buffer, __in_ecount(cchIn) const char16 *source, charcount_t cchIn)
+    size_t EncodeIntoImpl(_When_(!countBytesOnly, __out_ecount(cchIn * 3)) LPUTF8 buffer, __in_ecount(cchIn) const char16 *source, charcount_t cchIn, const void* bufferEnd)
     {
         charcount_t cch = cchIn; // SAL analysis gets confused by EncodeTrueUtf8's dest buffer requirement unless we alias cchIn with a local
         LPUTF8 dest = buffer;
 
+        CodexAssertOrFailFast(dest <= bufferEnd);
+
         if (!ShouldFastPath(dest, source)) goto LSlowPath;
 
 LFastPath:
@@ -505,18 +449,24 @@ namespace utf8
             if ( (first & 0xFF80FF80) != 0) goto LSlowPath;
             uint32 second = ((const uint32 *)source)[1];
             if ( (second & 0xFF80FF80) != 0) goto LSlowPath;
-            *(uint32 *)dest = (first & 0x0000007F) | ((first & 0x007F0000) >> 8) | ((second & 0x0000007f) << 16) | ((second & 0x007F0000) << 8);
+
+            if (!countBytesOnly)
+            {
+                CodexAssertOrFailFast(dest + 4 <= bufferEnd);
+                *(uint32 *)dest = (first & 0x0000007F) | ((first & 0x007F0000) >> 8) | ((second & 0x0000007f) << 16) | ((second & 0x007F0000) << 8);
+            }
             dest += 4;
             source += 4;
             cch -= 4;
+
         }
 
 LSlowPath:
         if (cesu8Encoding)
         {
             while (cch-- > 0)
             {
-                dest = Encode(*source++, dest);
+                dest = Encode<countBytesOnly>(*source++, dest, bufferEnd);
                 if (ShouldFastPath(dest, source)) goto LFastPath;
             }
         }
@@ -528,7 +478,7 @@ namespace utf8
                 // If the code unit turns out to be the high surrogate in a surrogate pair, then
                 // EncodeTrueUtf8 will consume the low surrogate code unit too by decrementing cch
                 // and incrementing source
-                dest = EncodeTrueUtf8(*source++, &source, &cch, dest);
+                dest = EncodeTrueUtf8<countBytesOnly>(*source++, &source, &cch, dest, bufferEnd);
                 if (ShouldFastPath(dest, source)) goto LFastPath;
             }
         }
@@ -539,7 +489,7 @@ namespace utf8
     __range(0, cch * 3)
         size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
     {
-        return EncodeIntoImpl<true>(buffer, source, cch);
+        return EncodeIntoImpl<true, false>(buffer, source, cch, &buffer[cch*3]);
     }
 
     __range(0, cch * 3)
@@ -553,11 +503,24 @@ namespace utf8
     __range(0, cch * 3)
         size_t EncodeTrueUtf8IntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
     {
-        size_t result = EncodeIntoImpl<false>(buffer, source, cch);
+        size_t result = EncodeIntoImpl<false, false>(buffer, source, cch, &buffer[3 * cch]);
         buffer[result] = 0;
         return result;
     }
 
+    __range(0, cch * 3)
+        size_t EncodeTrueUtf8IntoBoundsChecked(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch, const void * bufferEnd)
+    {
+        size_t result = EncodeIntoImpl<false, false>(buffer, source, cch, bufferEnd);
+        return result;
+    }
+
+    __range(0, cch * 3)
+        size_t CountTrueUtf8(__in_ecount(cch) const char16 *source, charcount_t cch)
+    {
+        return EncodeIntoImpl<false, true>(nullptr, source, cch, nullptr);
+    }
+
     // Convert the character index into a byte index.
     size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, charcount_t cchIndex, DecodeOptions options)
     {