Skip to content

Commit 2246c96

Browse files
authored
Merge pull request #15244 from Marcono1234/marcono1234/regex-flags
Java: Improve Regex flag parsing
2 parents 6c9f79c + 3edfdc5 commit 2246c96

File tree

5 files changed

+60
-15
lines changed

5 files changed

+60
-15
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: fix
3+
---
4+
* Fixed regular expressions containing flags not being parsed correctly in some cases.

java/ql/lib/semmle/code/java/regex/regex.qll

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ abstract class RegexString extends StringLiteral {
479479
private predicate flagGroupStartNoModes(int start, int end) {
480480
this.isGroupStart(start) and
481481
this.getChar(start + 1) = "?" and
482-
this.getChar(start + 2) in ["i", "m", "s", "u", "x", "U"] and
482+
this.getChar(start + 2) in ["-", "i", "d", "m", "s", "u", "x", "U"] and
483483
end = start + 2
484484
}
485485

@@ -491,15 +491,18 @@ abstract class RegexString extends StringLiteral {
491491
this.flagGroupStartNoModes(start, pos)
492492
or
493493
this.modeCharacter(start, pos - 1) and
494-
this.getChar(pos) in ["i", "m", "s", "u", "x", "U"]
494+
this.getChar(pos) in ["-", "i", "d", "m", "s", "u", "x", "U"]
495495
}
496496

497497
/**
498498
* Holds if a parse mode group is between `start` and `end`.
499499
*/
500500
private predicate flagGroupStart(int start, int end) {
501501
this.flagGroupStartNoModes(start, _) and
502-
end = max(int i | this.modeCharacter(start, i) | i + 1)
502+
// Check if this is a capturing group with flags, and therefore the `:` should be excluded
503+
exists(int maybeEnd | maybeEnd = max(int i | this.modeCharacter(start, i) | i + 1) |
504+
if this.getChar(maybeEnd) = ":" then end = maybeEnd + 1 else end = maybeEnd
505+
)
503506
}
504507

505508
/**
@@ -510,9 +513,15 @@ abstract class RegexString extends StringLiteral {
510513
* ```
511514
*/
512515
private predicate flag(string c) {
513-
exists(int pos |
514-
this.modeCharacter(_, pos) and
515-
this.getChar(pos) = c
516+
exists(int start, int pos |
517+
this.modeCharacter(start, pos) and
518+
this.getChar(pos) = c and
519+
// Ignore if flag is disabled; use `<=` to also exclude `-` itself
520+
// This does not properly handle the (contrived) case where a flag is both enabled and
521+
// disabled, e.g. `(?i-i)a+`, in which case the flag seems to acts as if it was disabled
522+
not exists(int minusPos |
523+
this.modeCharacter(start, minusPos) and this.getChar(minusPos) = "-" and minusPos <= pos
524+
)
516525
)
517526
}
518527

@@ -524,6 +533,8 @@ abstract class RegexString extends StringLiteral {
524533
exists(string c | this.flag(c) |
525534
c = "i" and result = "IGNORECASE"
526535
or
536+
c = "d" and result = "UNIXLINES"
537+
or
527538
c = "m" and result = "MULTILINE"
528539
or
529540
c = "s" and result = "DOTALL"
@@ -930,13 +941,13 @@ class Regex extends RegexString {
930941

931942
/**
932943
* Gets a mode (if any) of this regular expression. Can be any of:
933-
* DEBUG
934-
* IGNORECASE
935-
* MULTILINE
936-
* DOTALL
937-
* UNICODE
938-
* VERBOSE
939-
* UNICODECLASS
944+
* - IGNORECASE
945+
* - UNIXLINES
946+
* - MULTILINE
947+
* - DOTALL
948+
* - UNICODE
949+
* - VERBOSE
950+
* - UNICODECLASS
940951
*/
941952
string getAMode() {
942953
result != "None" and
@@ -946,7 +957,7 @@ class Regex extends RegexString {
946957
}
947958

948959
/**
949-
* Holds if this regex is used to match against a full string,
960+
* Holds if this regex is used to match against a full string,
950961
* as though it was implicitly surrounded by ^ and $.
951962
*/
952963
predicate matchesFullString() { matches_full_string = true }

java/ql/test/library-tests/regex/parser/RegexParseTests.expected

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
parseFailures
2+
modes
3+
| Test.java:17:9:17:37 | "(?i)(?=a)(?!b)(?<=c)(?<!d)+" | IGNORECASE |
4+
| Test.java:22:9:22:85 | "(?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+" | DOTALL,IGNORECASE,MULTILINE,UNICODE,UNICODECLASS,UNIXLINES,VERBOSE |
5+
| Test.java:23:9:23:24 | "(?idms-iuxU)a+" | DOTALL,IGNORECASE,MULTILINE,UNIXLINES |
26
#select
37
| Test.java:5:10:5:17 | [A-Z\\d] | [RegExpCharacterClass] |
48
| Test.java:5:10:5:19 | [A-Z\\d]++ | [RegExpPlus] |
@@ -205,3 +209,25 @@ parseFailures
205209
| Test.java:21:62:21:62 | b | [RegExpConstant,RegExpNormalChar] |
206210
| Test.java:21:64:21:64 | b | [RegExpConstant,RegExpNormalChar] |
207211
| Test.java:21:66:21:66 | b | [RegExpConstant,RegExpNormalChar] |
212+
| Test.java:22:10:22:27 | (?idmsuxU-idmsuxU) | [RegExpZeroWidthMatch] |
213+
| Test.java:22:10:22:84 | (?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+ | [RegExpSequence] |
214+
| Test.java:22:28:22:28 | a | [RegExpConstant,RegExpNormalChar] |
215+
| Test.java:22:28:22:29 | a+ | [RegExpPlus] |
216+
| Test.java:22:30:22:40 | (?-idmsuxU) | [RegExpZeroWidthMatch] |
217+
| Test.java:22:41:22:41 | b | [RegExpConstant,RegExpNormalChar] |
218+
| Test.java:22:41:22:42 | b+ | [RegExpPlus] |
219+
| Test.java:22:43:22:54 | (?idmsuxU:c) | [RegExpGroup] |
220+
| Test.java:22:53:22:53 | c | [RegExpConstant,RegExpNormalChar] |
221+
| Test.java:22:55:22:55 | d | [RegExpConstant,RegExpNormalChar] |
222+
| Test.java:22:55:22:56 | d+ | [RegExpPlus] |
223+
| Test.java:22:57:22:69 | (?-idmsuxU:e) | [RegExpGroup] |
224+
| Test.java:22:68:22:68 | e | [RegExpConstant,RegExpNormalChar] |
225+
| Test.java:22:70:22:70 | f | [RegExpConstant,RegExpNormalChar] |
226+
| Test.java:22:70:22:71 | f+ | [RegExpPlus] |
227+
| Test.java:22:72:22:82 | (?idmsuxU:) | [RegExpZeroWidthMatch] |
228+
| Test.java:22:83:22:83 | g | [RegExpConstant,RegExpNormalChar] |
229+
| Test.java:22:83:22:84 | g+ | [RegExpPlus] |
230+
| Test.java:23:10:23:21 | (?idms-iuxU) | [RegExpZeroWidthMatch] |
231+
| Test.java:23:10:23:23 | (?idms-iuxU)a+ | [RegExpSequence] |
232+
| Test.java:23:22:23:22 | a | [RegExpConstant,RegExpNormalChar] |
233+
| Test.java:23:22:23:23 | a+ | [RegExpPlus] |

java/ql/test/library-tests/regex/parser/RegexParseTests.ql

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,7 @@ string getQLClases(RegexTreeView::RegExpTerm t) {
88

99
query predicate parseFailures(Regex::Regex r, int i) { r.failedToParse(i) }
1010

11+
query predicate modes(Regex::Regex r, string modes) { modes = strictconcat(r.getAMode(), ",") }
12+
1113
from RegexTreeView::RegExpTerm t
1214
select t, getQLClases(t)

java/ql/test/library-tests/regex/parser/Test.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ class Test {
1818
"a||b|c(d|e|)f|g+",
1919
"\\018\\033\\0377\\0777\u1337+",
2020
"[|]+",
21-
"(a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+"
21+
"(a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+",
22+
"(?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+",
23+
"(?idms-iuxU)a+",
2224
};
2325

2426
void test() {

0 commit comments

Comments
 (0)