Skip to content

Java: Improve Regex flag parsing #15244

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions java/ql/lib/change-notes/2024-01-06-regex-flag-parsing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
category: fix
---
* Fixed regular expressions containing flags not being parsed correctly in some cases.
39 changes: 25 additions & 14 deletions java/ql/lib/semmle/code/java/regex/regex.qll
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,7 @@ abstract class RegexString extends StringLiteral {
private predicate flagGroupStartNoModes(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
this.getChar(start + 2) in ["i", "m", "s", "u", "x", "U"] and
this.getChar(start + 2) in ["-", "i", "d", "m", "s", "u", "x", "U"] and
end = start + 2
}

Expand All @@ -491,15 +491,18 @@ abstract class RegexString extends StringLiteral {
this.flagGroupStartNoModes(start, pos)
or
this.modeCharacter(start, pos - 1) and
this.getChar(pos) in ["i", "m", "s", "u", "x", "U"]
this.getChar(pos) in ["-", "i", "d", "m", "s", "u", "x", "U"]
}

/**
* Holds if a parse mode group is between `start` and `end`.
*/
private predicate flagGroupStart(int start, int end) {
this.flagGroupStartNoModes(start, _) and
end = max(int i | this.modeCharacter(start, i) | i + 1)
// Check if this is a non-capturing group with flags (e.g. `(?i:...)`); if so, the `:` belongs to the group start and is therefore excluded from the group's contents
exists(int maybeEnd | maybeEnd = max(int i | this.modeCharacter(start, i) | i + 1) |
if this.getChar(maybeEnd) = ":" then end = maybeEnd + 1 else end = maybeEnd
)
}

/**
Expand All @@ -510,9 +513,15 @@ abstract class RegexString extends StringLiteral {
* ```
*/
private predicate flag(string c) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably out of scope, but this predicate might not behave as desired when the flags of a group don't affect the whole pattern; e.g. for `a(?i:b)` it would have the flag `i` as a result, even though that flag only affects the `b`.

exists(int pos |
this.modeCharacter(_, pos) and
this.getChar(pos) = c
exists(int start, int pos |
this.modeCharacter(start, pos) and
this.getChar(pos) = c and
// Ignore if flag is disabled; use `<=` to also exclude `-` itself
// This does not properly handle the (contrived) case where a flag is both enabled and
// disabled, e.g. `(?i-i)a+`, in which case the flag seems to act as if it was disabled
not exists(int minusPos |
this.modeCharacter(start, minusPos) and this.getChar(minusPos) = "-" and minusPos <= pos
)
)
}

Expand All @@ -524,6 +533,8 @@ abstract class RegexString extends StringLiteral {
exists(string c | this.flag(c) |
c = "i" and result = "IGNORECASE"
or
c = "d" and result = "UNIXLINES"
or
c = "m" and result = "MULTILINE"
or
c = "s" and result = "DOTALL"
Expand Down Expand Up @@ -930,13 +941,13 @@ class Regex extends RegexString {

/**
* Gets a mode (if any) of this regular expression. Can be any of:
* DEBUG
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed this DEBUG because it does not seem to be a possible value.

* IGNORECASE
* MULTILINE
* DOTALL
* UNICODE
* VERBOSE
* UNICODECLASS
* - IGNORECASE
* - UNIXLINES
* - MULTILINE
* - DOTALL
* - UNICODE
* - VERBOSE
* - UNICODECLASS
*/
string getAMode() {
result != "None" and
Expand All @@ -946,7 +957,7 @@ class Regex extends RegexString {
}

/**
* Holds if this regex is used to match against a full string,
* Holds if this regex is used to match against a full string,
* as though it was implicitly surrounded by ^ and $.
*/
predicate matchesFullString() { matches_full_string = true }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
parseFailures
modes
| Test.java:17:9:17:37 | "(?i)(?=a)(?!b)(?<=c)(?<!d)+" | IGNORECASE |
| Test.java:22:9:22:85 | "(?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+" | DOTALL,IGNORECASE,MULTILINE,UNICODE,UNICODECLASS,UNIXLINES,VERBOSE |
| Test.java:23:9:23:24 | "(?idms-iuxU)a+" | DOTALL,IGNORECASE,MULTILINE,UNIXLINES |
#select
| Test.java:5:10:5:17 | [A-Z\\d] | [RegExpCharacterClass] |
| Test.java:5:10:5:19 | [A-Z\\d]++ | [RegExpPlus] |
Expand Down Expand Up @@ -205,3 +209,25 @@ parseFailures
| Test.java:21:62:21:62 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:64:21:64 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:21:66:21:66 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:10:22:27 | (?idmsuxU-idmsuxU) | [RegExpZeroWidthMatch] |
| Test.java:22:10:22:84 | (?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+ | [RegExpSequence] |
| Test.java:22:28:22:28 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:28:22:29 | a+ | [RegExpPlus] |
| Test.java:22:30:22:40 | (?-idmsuxU) | [RegExpZeroWidthMatch] |
| Test.java:22:41:22:41 | b | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:41:22:42 | b+ | [RegExpPlus] |
| Test.java:22:43:22:54 | (?idmsuxU:c) | [RegExpGroup] |
| Test.java:22:53:22:53 | c | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:55:22:55 | d | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:55:22:56 | d+ | [RegExpPlus] |
| Test.java:22:57:22:69 | (?-idmsuxU:e) | [RegExpGroup] |
| Test.java:22:68:22:68 | e | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:70:22:70 | f | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:70:22:71 | f+ | [RegExpPlus] |
| Test.java:22:72:22:82 | (?idmsuxU:) | [RegExpZeroWidthMatch] |
| Test.java:22:83:22:83 | g | [RegExpConstant,RegExpNormalChar] |
| Test.java:22:83:22:84 | g+ | [RegExpPlus] |
| Test.java:23:10:23:21 | (?idms-iuxU) | [RegExpZeroWidthMatch] |
| Test.java:23:10:23:23 | (?idms-iuxU)a+ | [RegExpSequence] |
| Test.java:23:22:23:22 | a | [RegExpConstant,RegExpNormalChar] |
| Test.java:23:22:23:23 | a+ | [RegExpPlus] |
2 changes: 2 additions & 0 deletions java/ql/test/library-tests/regex/parser/RegexParseTests.ql
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,7 @@ string getQLClases(RegexTreeView::RegExpTerm t) {

query predicate parseFailures(Regex::Regex r, int i) { r.failedToParse(i) }

query predicate modes(Regex::Regex r, string modes) { modes = strictconcat(r.getAMode(), ",") }

from RegexTreeView::RegExpTerm t
select t, getQLClases(t)
4 changes: 3 additions & 1 deletion java/ql/test/library-tests/regex/parser/Test.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ class Test {
"a||b|c(d|e|)f|g+",
"\\018\\033\\0377\\0777\u1337+",
"[|]+",
"(a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+"
"(a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+",
"(?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+",
"(?idms-iuxU)a+",
};

void test() {
Expand Down