Skip to content

Improve unicode escape in regex #3656

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 46 additions & 5 deletions src/com/google/javascript/jscomp/regex/RegExpTree.java
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ private RegExpTree parseCharset() {
CharRanges ieExplicits = CharRanges.EMPTY;
while (pos < limit && pattern.charAt(pos) != ']') {
char ch = pattern.charAt(pos);
char start;
int start;
if (ch == '\\') {
++pos;
char possibleGroupName = pattern.charAt(pos);
Expand All @@ -414,7 +414,7 @@ private RegExpTree parseCharset() {
start = ch;
++pos;
}
char end = start;
int end = start;
if (pos + 1 < limit && pattern.charAt(pos) == '-'
&& pattern.charAt(pos + 1) != ']') {
++pos;
Expand Down Expand Up @@ -464,15 +464,20 @@ private RegExpTree parseCharset() {
* contexts, so contexts must filter those instead.
* E.g. '\b' means a different thing inside a charset than without.
*/
private char parseEscapeChar() {
private int parseEscapeChar() {
char ch = pattern.charAt(pos++);
switch (ch) {
case 'b': return '\b';
case 'f': return '\f';
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'u': return parseHex(4);
case 'u':
if (flags.contains("u") && pos < limit && pattern.charAt(pos) == '{') {
return parseUnicodeEscape();
} else {
return parseHex(4);
}
case 'v': return '\u000b';
case 'x': return parseHex(2);
default:
Expand Down Expand Up @@ -599,7 +604,7 @@ private RegExpTree parseEscape() {
++pos;
return new Charset(charGroup, CharRanges.EMPTY);
}
return new Text("" + parseEscapeChar());
return new Text(new String(Character.toChars(parseEscapeChar())));
}
}

Expand Down Expand Up @@ -630,6 +635,42 @@ private char parseHex(int n) {
return (char) result;
}

/**
 * Parses the braced part of a {@code \}{@code u{...}} code point escape.
 *
 * <p>On entry {@code pos} must point at the opening <code>{</code>; on successful return it
 * points just past the closing <code>}</code>.
 *
 * @return the code point denoted by the hexadecimal digits, in the range 0 to 0x10FFFF
 * @throws IllegalArgumentException if the braces are empty, a non-hex character appears, the
 *     closing brace is missing, or the value exceeds 0x10FFFF
 */
private int parseUnicodeEscape() {
  checkState(pattern.charAt(pos) == '{');
  int start = pos++;
  // Guard the look-ahead so that an escape truncated right after '{' falls through to the
  // "Malformed" error below instead of reading past the parse limit.
  if (pos < limit && pattern.charAt(pos) == '}') {
    throw new IllegalArgumentException("Invalid unicode escape: "
        + pattern.substring(start, ++pos));
  }
  int result = 0;
  boolean closed = false;
  while (pos < limit) {
    char ch = pattern.charAt(pos++);
    if (ch == '}') {
      closed = true;
      break;
    }
    int digit;
    if ('0' <= ch && ch <= '9') {
      digit = ch - '0';
    } else if ('a' <= ch && ch <= 'f') {
      digit = ch + (10 - 'a');
    } else if ('A' <= ch && ch <= 'F') {
      digit = ch + (10 - 'A');
    } else {
      throw new IllegalArgumentException("Invalid character in unicode escape: " + ch);
    }
    // Saturate once the value is out of range: shifting further could overflow int and wrap
    // back into the valid range, silently accepting escapes such as \ u{100000041}.
    if (result <= Character.MAX_CODE_POINT) {
      result = (result << 4) | digit;
    }
  }
  if (!closed) {
    throw new IllegalArgumentException("Malformed unicode escape: expected '}' after "
        + pattern.substring(start, pos));
  }
  if (result > Character.MAX_CODE_POINT) {
    throw new IllegalArgumentException("Unicode must not be greater than 0x10FFFF: "
        + pattern.substring(start, pos));
  }
  return result;
}

private boolean isRepetitionStart(char ch) {
switch (ch) {
case '?':
Expand Down
12 changes: 12 additions & 0 deletions test/com/google/javascript/jscomp/parsing/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4894,6 +4894,14 @@ public void testRegExpError() {
parseError("/\b.\\/", "Expected '/' in regular expression literal");
}

/** Regex literals containing four-digit and braced unicode escapes parse to REGEXP nodes. */
@Test
public void testRegExpUnicode() {
  // The classic four-digit form, without any flag.
  assertNodeEquality(parse("/\\u10fA/"), script(expr(regex("\\u10fA"))));

  // Braced code point escapes, each parsed with the "u" flag.
  String[] bracedEscapes = {"\\u{10fA}", "\\u{1fA}", "\\u{10FFFF}"};
  for (String escape : bracedEscapes) {
    assertNodeEquality(parse("/" + escape + "/u"), script(expr(regex(escape, "u"))));
  }
}

@Test
public void testRegExpFlags() {
// Various valid combinations.
Expand Down Expand Up @@ -6556,6 +6564,10 @@ private static Node regex(String regex) {
return new Node(Token.REGEXP, Node.newString(regex));
}

/** Builds a REGEXP node with two string children: the pattern, then the flag. */
private static Node regex(String regex, String flag) {
  Node patternNode = Node.newString(regex);
  Node flagNode = Node.newString(flag);
  return new Node(Token.REGEXP, patternNode, flagNode);
}

/**
* Verify that the given code has the given parse errors.
* @return If in IDE mode, returns a partial tree.
Expand Down
26 changes: 26 additions & 0 deletions test/com/google/javascript/jscomp/regex/RegExpTreeTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,30 @@ public void testBackreferencingTreatedAsStringIfNoGroup() {
// (?: ) in expected output serves same purpose as above test
assertRegexCompilesTo("[(?<foo>)]\\k<foo>", "", "(?:[()<>?fo]k)<foo>");
}

/** Well-formed unicode escapes compile to the expected output pattern. */
@Test
public void testValidUnicodeEscape() {
  // Each row is {input pattern, flags, expected compiled pattern}.
  String[][] cases = {
    {"\\u0061", "", "a"},
    {"\\u10b1", "u", "\\u10b1"},
    {"\\u{61}", "u", "a"},
    {"\\u{10b1}", "u", "\\u10b1"},
    {"\\u{1bc}", "u", "\\u01bc"},
    // A code point above U+FFFF is expected back as a surrogate pair.
    {"\\u{100A3}", "u", "\\ud800\\udca3"},
  };
  for (String[] testCase : cases) {
    assertRegexCompilesTo(testCase[0], testCase[1], testCase[2]);
  }
}

/** Malformed braced unicode escapes are rejected with a specific error message. */
@Test
public void testInvalidUnicodeEscape() {
  // Each row is {input pattern, expected exception message}; all use the "u" flag.
  String[][] cases = {
    {"\\u{a012", "Malformed unicode escape: expected '}' after {a012"},
    {"\\u{}", "Invalid unicode escape: {}"},
    {"\\u{10za}", "Invalid character in unicode escape: z"},
    {"\\u{FFFFFF}", "Unicode must not be greater than 0x10FFFF: {FFFFFF}"},
  };
  for (String[] testCase : cases) {
    String badPattern = testCase[0];
    String expectedMessage = testCase[1];
    assertRegexThrowsExceptionThat(badPattern, "u")
        .hasMessageThat()
        .isEqualTo(expectedMessage);
  }
}
}