|
16 | 16 | */ |
17 | 17 |
|
18 | 18 | import { randomBytes } from '../platform/random_bytes'; |
19 | | -import { newTextEncoder } from '../platform/text_serializer'; |
20 | 19 |
|
21 | 20 | import { debugAssert } from './assert'; |
22 | 21 |
|
@@ -77,63 +76,50 @@ export interface Equatable<T> { |
77 | 76 |
|
78 | 77 | /** Compare strings in UTF-8 encoded byte order */ |
79 | 78 | export function compareUtf8Strings(left: string, right: string): number { |
80 | | - let i = 0; |
81 | | - while (i < left.length && i < right.length) { |
82 | | - const leftCodePoint = left.codePointAt(i)!; |
83 | | - const rightCodePoint = right.codePointAt(i)!; |
84 | | - |
85 | | - if (leftCodePoint !== rightCodePoint) { |
86 | | - if (leftCodePoint < 128 && rightCodePoint < 128) { |
87 | | - // ASCII comparison |
88 | | - return primitiveComparator(leftCodePoint, rightCodePoint); |
89 | | - } else { |
90 | | - // Lazy instantiate TextEncoder |
91 | | - const encoder = newTextEncoder(); |
92 | | - |
93 | | - // UTF-8 encode the character at index i for byte comparison. |
94 | | - const leftBytes = encoder.encode(getUtf8SafeSubstring(left, i)); |
95 | | - const rightBytes = encoder.encode(getUtf8SafeSubstring(right, i)); |
96 | | - |
97 | | - const comp = compareByteArrays(leftBytes, rightBytes); |
98 | | - if (comp !== 0) { |
99 | | - return comp; |
100 | | - } else { |
101 | | - // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte |
102 | | - // representations are identical. This can happen with malformed input |
103 | | - // (invalid surrogate pairs). The backend also actively prevents invalid |
104 | | - // surrogates as INVALID_ARGUMENT errors, so we almost never receive |
105 | | - // invalid strings from backend. |
106 | | - // Fallback to code point comparison for graceful handling. |
107 | | - return primitiveComparator(leftCodePoint, rightCodePoint); |
108 | | - } |
109 | | - } |
| 79 | + // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and, |
| 80 | + // if found, use that character to determine the relative ordering of the two strings as a |
| 81 | + // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by |
| 82 | + // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8 |
| 83 | + // and UTF-16 happen to represent Unicode code points. |
| 84 | + // |
| 85 | + // After finding the first pair of differing characters, there are two cases: |
| 86 | + // |
| 87 | + // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or |
| 88 | + // both are surrogates from a surrogate pair (that collectively represent code points greater |
| 89 | + // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the |
| 90 | + // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is |
| 91 | + // sufficient. |
| 92 | + // |
| 93 | + // Case 2: One character is a surrogate and the other is not. In this case the surrogate- |
| 94 | + // containing string is always ordered after the non-surrogate. This is because surrogates are |
| 95 | + // used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations |
| 96 | + // and are lexicographically greater than the 1, 2, or 3-byte representations of code points |
| 97 | + // less than or equal to 0xFFFF. |
| 98 | + const length = Math.min(left.length, right.length); |
| 99 | + for (let i = 0; i < length; i++) { |
| 100 | + const leftChar = left.charAt(i); |
| 101 | + const rightChar = right.charAt(i); |
| 102 | + if (leftChar !== rightChar) { |
| 103 | + return isSurrogate(leftChar) === isSurrogate(rightChar) |
| 104 | + ? primitiveComparator(leftChar, rightChar) |
| 105 | + : isSurrogate(leftChar) |
| 106 | + ? 1 |
| 107 | + : -1; |
110 | 108 | } |
111 | | - // Increment by 2 for surrogate pairs, 1 otherwise |
112 | | - i += leftCodePoint > 0xffff ? 2 : 1; |
113 | 109 | } |
114 | 110 |
|
115 | | - // Compare lengths if all characters are equal |
| 111 | + // Use the lengths of the strings to determine the overall comparison result since either the |
| 112 | + // strings are equal or one is a prefix of the other. |
116 | 113 | return primitiveComparator(left.length, right.length); |
117 | 114 | } |
118 | 115 |
|
119 | | -function getUtf8SafeSubstring(str: string, index: number): string { |
120 | | - const firstCodePoint = str.codePointAt(index)!; |
121 | | - if (firstCodePoint > 0xffff) { |
122 | | - // It's a surrogate pair, return the whole pair |
123 | | - return str.substring(index, index + 2); |
124 | | - } else { |
125 | | - // It's a single code point, return it |
126 | | - return str.substring(index, index + 1); |
127 | | - } |
128 | | -} |
| 116 | +const MIN_SURROGATE = 0xd800; |
| 117 | +const MAX_SURROGATE = 0xdfff; |
129 | 118 |
|
130 | | -function compareByteArrays(left: Uint8Array, right: Uint8Array): number { |
131 | | - for (let i = 0; i < left.length && i < right.length; ++i) { |
132 | | - if (left[i] !== right[i]) { |
133 | | - return primitiveComparator(left[i], right[i]); |
134 | | - } |
135 | | - } |
136 | | - return primitiveComparator(left.length, right.length); |
| 119 | +export function isSurrogate(s: string): boolean { |
| 120 | + debugAssert(s.length === 1, `s.length == ${s.length}, but expected 1`); |
| 121 | + const c = s.charCodeAt(0); |
| 122 | + return c >= MIN_SURROGATE && c <= MAX_SURROGATE; |
137 | 123 | } |
138 | 124 |
|
139 | 125 | export interface Iterable<V> { |
|
0 commit comments