Skip to content

Commit dc1ffb1

Browse files
Correct regular expression flags scanning for non-BMP characters (#58612)
Co-authored-by: Ron Buckton <ron.buckton@microsoft.com>
1 parent 8d62e2f commit dc1ffb1

File tree

6 files changed

+120
-33
lines changed

6 files changed

+120
-33
lines changed

src/compiler/scanner.ts

Lines changed: 35 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -282,16 +282,16 @@ const textToToken = new Map(Object.entries({
282282
"`": SyntaxKind.BacktickToken,
283283
}));
284284

285-
const charToRegExpFlag = new Map(Object.entries({
286-
d: RegularExpressionFlags.HasIndices,
287-
g: RegularExpressionFlags.Global,
288-
i: RegularExpressionFlags.IgnoreCase,
289-
m: RegularExpressionFlags.Multiline,
290-
s: RegularExpressionFlags.DotAll,
291-
u: RegularExpressionFlags.Unicode,
292-
v: RegularExpressionFlags.UnicodeSets,
293-
y: RegularExpressionFlags.Sticky,
294-
}));
285+
const charCodeToRegExpFlag = new Map<CharacterCodes, RegularExpressionFlags>([
286+
[CharacterCodes.d, RegularExpressionFlags.HasIndices],
287+
[CharacterCodes.g, RegularExpressionFlags.Global],
288+
[CharacterCodes.i, RegularExpressionFlags.IgnoreCase],
289+
[CharacterCodes.m, RegularExpressionFlags.Multiline],
290+
[CharacterCodes.s, RegularExpressionFlags.DotAll],
291+
[CharacterCodes.u, RegularExpressionFlags.Unicode],
292+
[CharacterCodes.v, RegularExpressionFlags.UnicodeSets],
293+
[CharacterCodes.y, RegularExpressionFlags.Sticky],
294+
]);
295295

296296
const regExpFlagToFirstAvailableLanguageVersion = new Map<RegularExpressionFlags, LanguageFeatureMinimumTarget>([
297297
[RegularExpressionFlags.HasIndices, LanguageFeatureMinimumTarget.RegularExpressionFlagsHasIndices],
@@ -394,8 +394,8 @@ function isUnicodeIdentifierPart(code: number, languageVersion: ScriptTarget | u
394394
lookupInUnicodeMap(code, unicodeES5IdentifierPart);
395395
}
396396

397-
function makeReverseMap(source: Map<string, number>): string[] {
398-
const result: string[] = [];
397+
function makeReverseMap<T>(source: Map<T, number>): T[] {
398+
const result: T[] = [];
399399
source.forEach((value, name) => {
400400
result[value] = name;
401401
});
@@ -416,16 +416,16 @@ export function stringToToken(s: string): SyntaxKind | undefined {
416416
return textToToken.get(s);
417417
}
418418

419-
const regExpFlagChars = makeReverseMap(charToRegExpFlag);
419+
const regExpFlagCharCodes = makeReverseMap(charCodeToRegExpFlag);
420420

421421
/** @internal */
422-
export function regularExpressionFlagToCharacter(f: RegularExpressionFlags): string | undefined {
423-
return regExpFlagChars[f];
422+
export function regularExpressionFlagToCharacterCode(f: RegularExpressionFlags): CharacterCodes | undefined {
423+
return regExpFlagCharCodes[f];
424424
}
425425

426426
/** @internal */
427-
export function characterToRegularExpressionFlag(c: string): RegularExpressionFlags | undefined {
428-
return charToRegExpFlag.get(c);
427+
export function characterCodeToRegularExpressionFlag(ch: CharacterCodes): RegularExpressionFlags | undefined {
428+
return charCodeToRegExpFlag.get(ch);
429429
}
430430

431431
/** @internal */
@@ -2558,27 +2558,28 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
25582558
pos++;
25592559
let regExpFlags = RegularExpressionFlags.None;
25602560
while (true) {
2561-
const ch = charCodeChecked(pos);
2561+
const ch = codePointChecked(pos);
25622562
if (ch === CharacterCodes.EOF || !isIdentifierPart(ch, languageVersion)) {
25632563
break;
25642564
}
2565+
const size = charSize(ch);
25652566
if (reportErrors) {
2566-
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
2567+
const flag = characterCodeToRegularExpressionFlag(ch);
25672568
if (flag === undefined) {
2568-
error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
2569+
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
25692570
}
25702571
else if (regExpFlags & flag) {
2571-
error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
2572+
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
25722573
}
25732574
else if (((regExpFlags | flag) & RegularExpressionFlags.AnyUnicodeMode) === RegularExpressionFlags.AnyUnicodeMode) {
2574-
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, pos, 1);
2575+
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, pos, size);
25752576
}
25762577
else {
25772578
regExpFlags |= flag;
2578-
checkRegularExpressionFlagAvailable(flag, pos);
2579+
checkRegularExpressionFlagAvailability(flag, size);
25792580
}
25802581
}
2581-
pos++;
2582+
pos += size;
25822583
}
25832584
if (reportErrors) {
25842585
scanRange(startOfRegExpBody, endOfRegExpBody - startOfRegExpBody, () => {
@@ -2843,25 +2844,26 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
28432844

28442845
function scanPatternModifiers(currFlags: RegularExpressionFlags): RegularExpressionFlags {
28452846
while (true) {
2846-
const ch = charCodeChecked(pos);
2847+
const ch = codePointChecked(pos);
28472848
if (ch === CharacterCodes.EOF || !isIdentifierPart(ch, languageVersion)) {
28482849
break;
28492850
}
2850-
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
2851+
const size = charSize(ch);
2852+
const flag = characterCodeToRegularExpressionFlag(ch);
28512853
if (flag === undefined) {
2852-
error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
2854+
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
28532855
}
28542856
else if (currFlags & flag) {
2855-
error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
2857+
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
28562858
}
28572859
else if (!(flag & RegularExpressionFlags.Modifiers)) {
2858-
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, 1);
2860+
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, size);
28592861
}
28602862
else {
28612863
currFlags |= flag;
2862-
checkRegularExpressionFlagAvailable(flag, pos);
2864+
checkRegularExpressionFlagAvailability(flag, size);
28632865
}
2864-
pos++;
2866+
pos += size;
28652867
}
28662868
return currFlags;
28672869
}
@@ -3583,10 +3585,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
35833585
});
35843586
}
35853587

3586-
function checkRegularExpressionFlagAvailable(flag: RegularExpressionFlags, pos: number) {
3588+
function checkRegularExpressionFlagAvailability(flag: RegularExpressionFlags, size: number) {
35873589
const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag) as ScriptTarget | undefined;
35883590
if (availableFrom && languageVersion < availableFrom) {
3589-
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, 1, getNameOfScriptTarget(availableFrom));
3591+
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, size, getNameOfScriptTarget(availableFrom));
35903592
}
35913593
}
35923594

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,29 @@
1+
regularExpressionWithNonBMPFlags.ts(7,23): error TS1499: Unknown regular expression flag.
2+
regularExpressionWithNonBMPFlags.ts(7,25): error TS1499: Unknown regular expression flag.
3+
regularExpressionWithNonBMPFlags.ts(7,28): error TS1499: Unknown regular expression flag.
4+
regularExpressionWithNonBMPFlags.ts(7,41): error TS1499: Unknown regular expression flag.
5+
regularExpressionWithNonBMPFlags.ts(7,43): error TS1499: Unknown regular expression flag.
6+
regularExpressionWithNonBMPFlags.ts(7,45): error TS1499: Unknown regular expression flag.
7+
8+
9+
==== regularExpressionWithNonBMPFlags.ts (6 errors) ====
10+
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
11+
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
12+
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
13+
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
14+
//
15+
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
16+
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
17+
~~
18+
!!! error TS1499: Unknown regular expression flag.
19+
~~
20+
!!! error TS1499: Unknown regular expression flag.
21+
~~
22+
!!! error TS1499: Unknown regular expression flag.
23+
~~
24+
!!! error TS1499: Unknown regular expression flag.
25+
~~
26+
!!! error TS1499: Unknown regular expression flag.
27+
~~
28+
!!! error TS1499: Unknown regular expression flag.
29+
Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,20 @@
1+
//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] ////
2+
3+
//// [regularExpressionWithNonBMPFlags.ts]
4+
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
5+
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
6+
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
7+
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
8+
//
9+
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
10+
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
11+
12+
13+
//// [regularExpressionWithNonBMPFlags.js]
14+
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
15+
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
16+
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
17+
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
18+
//
19+
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
20+
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] ////
2+
3+
=== regularExpressionWithNonBMPFlags.ts ===
4+
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
5+
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
6+
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
7+
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
8+
//
9+
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
10+
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
11+
>𝘳𝘦𝘨𝘦𝘹 : Symbol(𝘳𝘦𝘨𝘦𝘹, Decl(regularExpressionWithNonBMPFlags.ts, 6, 5))
12+
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] ////
2+
3+
=== regularExpressionWithNonBMPFlags.ts ===
4+
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
5+
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
6+
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
7+
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
8+
//
9+
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
10+
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
11+
>𝘳𝘦𝘨𝘦𝘹 : RegExp
12+
> : ^^^^^^
13+
>/(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶 : RegExp
14+
> : ^^^^^^
15+
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// @target: esnext
2+
3+
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
4+
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
5+
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
6+
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
7+
//
8+
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
9+
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;

0 commit comments

Comments
 (0)