Skip to content

Commit f8a775f

Browse files
authored
Support supplementary CPs in Unicode identifiers (#2522)
1 parent 3135e7e commit f8a775f

File tree

6 files changed

+457
-169
lines changed

6 files changed

+457
-169
lines changed

scripts/unicode-identifier.js

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
// see https://github.com/microsoft/TypeScript/blob/main/scripts/regenerate-unicode-identifier-parts.js
2+
3+
const MAX_UNICODE_CODEPOINT = 0x10FFFF;
4+
const isStart = c => /[\p{ID_Start}\u{2118}\u{212E}\u{309B}\u{309C}]/u.test(c); // Other_ID_Start explicitly included for back compat - see http://www.unicode.org/reports/tr31/#Introduction
5+
const isPart = c => /[\p{ID_Continue}\u{00B7}\u{0387}\u{19DA}\u{1369}\u{136A}\u{136B}\u{136C}\u{136D}\u{136E}\u{136F}\u{1370}\u{1371}]/u.test(c) || isStart(c); // Likewise for Other_ID_Continue
6+
const parts = [];
7+
let partsActive = false;
8+
let startsActive = false;
9+
const starts = [];
10+
11+
// Skip 0-9 (48..57), A-Z (65..90), a-z (97..122) - these are checked separately
12+
for (let cp = 123; cp <= MAX_UNICODE_CODEPOINT; cp++) {
13+
if (isStart(String.fromCodePoint(cp)) !== startsActive) {
14+
starts.push(cp - +startsActive);
15+
startsActive = !startsActive;
16+
}
17+
if (isPart(String.fromCodePoint(cp)) !== partsActive) {
18+
parts.push(cp - +partsActive);
19+
partsActive = !partsActive;
20+
}
21+
}
22+
if (startsActive) starts.push(MAX_UNICODE_CODEPOINT);
23+
if (partsActive) parts.push(MAX_UNICODE_CODEPOINT);
24+
25+
function tablify(cps) {
26+
let sb = ["/*\n| from ... to | from ... to | from ... to | from ... to |*/"];
27+
let i = 0;
28+
while (i < cps.length) {
29+
if (!(i % 8)) sb.push("\n ");
30+
sb.push(`${cps[i++].toString().padEnd(6)}, `);
31+
}
32+
return sb.join("") + "\n";
33+
}
34+
35+
console.log(`/** Unicode ${process.versions.unicode} ID_Start/Other_ID_Start ranges */`);
36+
console.log(`const unicodeIdentifierStart: i32[] = [${tablify(starts)}];`);
37+
console.log(`const unicodeIdentifierStartMin = ${starts[0]};`);
38+
console.log(`const unicodeIdentifierStartMax = ${starts[starts.length - 1]};\n`);
39+
console.log(`/** Unicode ${process.versions.unicode} ID_Continue/Other_ID_Continue + ID_Start/Other_ID_Start ranges*/`);
40+
console.log(`const unicodeIdentifierPart: i32[] = [${tablify(parts)}];`);
41+
console.log(`const unicodeIdentifierPartMin = ${parts[0]};`);
42+
console.log(`const unicodeIdentifierPartMax = ${parts[parts.length - 1]};\n`);

src/diagnostics.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ function formatDiagnosticContext(range: Range): string {
267267
var lineSpace = " ".repeat(lineNumber.length);
268268
// Find preceding line break
269269
while (start > 0 && !isLineBreak(text.charCodeAt(start - 1))) start--;
270-
// Skip leading whitespace
270+
// Skip leading whitespace (assumes no whitespace characters outside the BMP)
271271
while (start < len && isWhiteSpace(text.charCodeAt(start))) start++;
272272
// Find next line break
273273
while (end < len && !isLineBreak(text.charCodeAt(end))) end++;

src/tokenizer.ts

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ import {
3333
isOctal,
3434
isHexBase,
3535
isHighSurrogate,
36-
isLowSurrogate
36+
combineSurrogates,
37+
numCodeUnits
3738
} from "./util";
3839

3940
/** Named token types. */
@@ -913,11 +914,15 @@ export class Tokenizer extends DiagnosticEmitter {
913914
return Token.AT;
914915
}
915916
default: {
917+
// Unicode-aware from here on
918+
if (isHighSurrogate(c) && pos + 1 < end) {
919+
c = combineSurrogates(c, text.charCodeAt(pos + 1));
920+
}
916921
if (isIdentifierStart(c)) {
917922
let posBefore = pos;
918923
while (
919-
++pos < end &&
920-
isIdentifierPart(c = text.charCodeAt(pos))
924+
(pos += numCodeUnits(c)) < end &&
925+
isIdentifierPart(c = <i32>text.codePointAt(pos))
921926
) { /* nop */ }
922927
if (identifierHandling != IdentifierHandling.ALWAYS) {
923928
let maybeKeywordToken = tokenFromKeyword(text.substring(posBefore, pos));
@@ -935,14 +940,11 @@ export class Tokenizer extends DiagnosticEmitter {
935940
this.pos = posBefore;
936941
return Token.IDENTIFIER;
937942
} else if (isWhiteSpace(c)) {
938-
++pos;
943+
++pos; // assumes no whitespace characters outside the BMP
939944
break;
940945
}
941-
let start = pos++;
942-
if (
943-
isHighSurrogate(c) && pos < end &&
944-
isLowSurrogate(text.charCodeAt(pos))
945-
) ++pos;
946+
let start = pos;
947+
pos += numCodeUnits(c);
946948
this.error(
947949
DiagnosticCode.Invalid_character,
948950
this.range(start, pos)
@@ -1055,9 +1057,11 @@ export class Tokenizer extends DiagnosticEmitter {
10551057
var end = this.end;
10561058
var pos = this.pos;
10571059
var start = pos;
1060+
var c = <i32>text.codePointAt(pos);
1061+
assert(isIdentifierStart(c));
10581062
while (
1059-
++pos < end &&
1060-
isIdentifierPart(text.charCodeAt(pos))
1063+
(pos += numCodeUnits(c)) < end &&
1064+
isIdentifierPart(c = <i32>text.codePointAt(pos))
10611065
);
10621066
this.pos = pos;
10631067
return text.substring(start, pos);

0 commit comments

Comments
 (0)