perf: faster utf8ToBytes (#94)

rvagg · Sep 12, 2023 · a7e62cb
1 parent aff9d25
commit a7e62cb
Showing 1 changed file with 29 additions and 86 deletions.
diff --git a/lib/byte-utils.js b/lib/byte-utils.js
@@ -275,101 +275,44 @@ export function compare (b1, b2) {
   return 0
 }
 
-// The below code is mostly taken from https://github.com/feross/buffer
-// Licensed MIT. Copyright (c) Feross Aboukhadijeh
+// The below code is taken from https://github.com/google/closure-library/blob/8598d87242af59aac233270742c8984e2b2bdbe0/closure/goog/crypt/crypt.js#L117-L143
+// Licensed Apache-2.0.
 
 /**
- * @param {string} string
- * @param {number} [units]
+ * @param {string} str
  * @returns {number[]}
  */
-function utf8ToBytes (string, units = Infinity) {
-  let codePoint
-  const length = string.length
-  let leadSurrogate = null
-  const bytes = []
-
-  for (let i = 0; i < length; ++i) {
-    codePoint = string.charCodeAt(i)
-
-    // is surrogate component
-    if (codePoint > 0xd7ff && codePoint < 0xe000) {
-      // last char was a lead
-      if (!leadSurrogate) {
-        // no lead yet
-        /* c8 ignore next 9 */
-        if (codePoint > 0xdbff) {
-          // unexpected trail
-          if ((units -= 3) > -1) bytes.push(0xef, 0xbf, 0xbd)
-          continue
-        } else if (i + 1 === length) {
-          // unpaired lead
-          if ((units -= 3) > -1) bytes.push(0xef, 0xbf, 0xbd)
-          continue
-        }
-
-        // valid lead
-        leadSurrogate = codePoint
-
-        continue
-      }
-
-      // 2 leads in a row
-      /* c8 ignore next 5 */
-      if (codePoint < 0xdc00) {
-        if ((units -= 3) > -1) bytes.push(0xef, 0xbf, 0xbd)
-        leadSurrogate = codePoint
-        continue
-      }
-
-      // valid surrogate pair
-      codePoint = (leadSurrogate - 0xd800 << 10 | codePoint - 0xdc00) + 0x10000
-    /* c8 ignore next 4 */
-    } else if (leadSurrogate) {
-      // valid bmp char, but last char was a lead
-      if ((units -= 3) > -1) bytes.push(0xef, 0xbf, 0xbd)
-    }
-
-    leadSurrogate = null
-
-    // encode utf8
-    if (codePoint < 0x80) {
-      /* c8 ignore next 1 */
-      if ((units -= 1) < 0) break
-      bytes.push(codePoint)
-    } else if (codePoint < 0x800) {
-      /* c8 ignore next 1 */
-      if ((units -= 2) < 0) break
-      bytes.push(
-        codePoint >> 0x6 | 0xc0,
-        codePoint & 0x3f | 0x80
-      )
-    } else if (codePoint < 0x10000) {
-      /* c8 ignore next 1 */
-      if ((units -= 3) < 0) break
-      bytes.push(
-        codePoint >> 0xc | 0xe0,
-        codePoint >> 0x6 & 0x3f | 0x80,
-        codePoint & 0x3f | 0x80
-      )
-    /* c8 ignore next 9 */
-    } else if (codePoint < 0x110000) {
-      if ((units -= 4) < 0) break
-      bytes.push(
-        codePoint >> 0x12 | 0xf0,
-        codePoint >> 0xc & 0x3f | 0x80,
-        codePoint >> 0x6 & 0x3f | 0x80,
-        codePoint & 0x3f | 0x80
-      )
+function utf8ToBytes (str) {
+  const out = []
+  let p = 0
+  for (let i = 0; i < str.length; i++) {
+    let c = str.charCodeAt(i)
+    if (c < 128) {
+      out[p++] = c
+    } else if (c < 2048) {
+      out[p++] = (c >> 6) | 192
+      out[p++] = (c & 63) | 128
+    } else if (
+      ((c & 0xFC00) === 0xD800) && (i + 1) < str.length &&
+      ((str.charCodeAt(i + 1) & 0xFC00) === 0xDC00)) {
+      // Surrogate Pair
+      c = 0x10000 + ((c & 0x03FF) << 10) + (str.charCodeAt(++i) & 0x03FF)
+      out[p++] = (c >> 18) | 240
+      out[p++] = ((c >> 12) & 63) | 128
+      out[p++] = ((c >> 6) & 63) | 128
+      out[p++] = (c & 63) | 128
     } else {
-      /* c8 ignore next 2 */
-      throw new Error('Invalid code point')
+      out[p++] = (c >> 12) | 224
+      out[p++] = ((c >> 6) & 63) | 128
+      out[p++] = (c & 63) | 128
     }
   }
-
-  return bytes
+  return out
 }
 
+// The below code is mostly taken from https://github.com/feross/buffer
+// Licensed MIT. Copyright (c) Feross Aboukhadijeh
+
 /**
  * @param {Uint8Array} buf
  * @param {number} offset