// The below code is adapted from https://github.com/google/closure-library/blob/8598d87242af59aac233270742c8984e2b2bdbe0/closure/goog/crypt/crypt.js#L117-L143
// Licensed Apache-2.0.

/**
 * Encode a JavaScript (UTF-16) string as an array of UTF-8 byte values.
 *
 * A lead surrogate immediately followed by a trail surrogate is combined into
 * one supplementary code point and written as a 4-byte sequence. Any other
 * surrogate code unit (a trail with no lead, a lead with no trail, or a lead
 * at the end of the string) is replaced with U+FFFD (0xef 0xbf 0xbd), so the
 * returned bytes are always well-formed UTF-8.
 *
 * @param {string} str
 * @returns {number[]}
 */
function utf8ToBytes (str) {
  const out = []
  let p = 0
  for (let i = 0; i < str.length; i++) {
    let c = str.charCodeAt(i)
    if (c < 128) {
      // 1 byte: ASCII
      out[p++] = c
    } else if (c < 2048) {
      // 2 bytes: U+0080..U+07FF
      out[p++] = (c >> 6) | 192
      out[p++] = (c & 63) | 128
    } else if (
      ((c & 0xFC00) === 0xD800) && (i + 1) < str.length &&
      ((str.charCodeAt(i + 1) & 0xFC00) === 0xDC00)) {
      // Surrogate pair: consume the trail unit as well and emit 4 bytes
      c = 0x10000 + ((c & 0x03FF) << 10) + (str.charCodeAt(++i) & 0x03FF)
      out[p++] = (c >> 18) | 240
      out[p++] = ((c >> 12) & 63) | 128
      out[p++] = ((c >> 6) & 63) | 128
      out[p++] = (c & 63) | 128
    } else {
      if ((c & 0xF800) === 0xD800) {
        // Unpaired surrogate (0xD800..0xDFFF): encoding it directly would
        // produce ill-formed UTF-8, so substitute the replacement character.
        c = 0xFFFD
      }
      // 3 bytes: remaining BMP code points
      out[p++] = (c >> 12) | 224
      out[p++] = ((c >> 6) & 63) | 128
      out[p++] = (c & 63) | 128
    }
  }
  return out
}

// The below code is mostly taken from https://github.com/feross/buffer
// Licensed MIT. Copyright (c) Feross Aboukhadijeh