Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/ql/lib/change-notes/2023-07-20-regex-parse-modes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
category: minorAnalysis
---
* Regular expressions containing multiple parse mode flags are now interpreted correctly. For example, `"(?is)abc.*"` is now recognized as having both the `i` and `s` flags.
29 changes: 23 additions & 6 deletions python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ class RegExp extends Expr instanceof StrConst {
private predicate group_start(int start, int end) {
this.non_capturing_group_start(start, end)
or
this.flag_group_start(start, end, _)
this.flag_group_start(start, end)
or
this.named_group_start(start, end)
or
Expand Down Expand Up @@ -679,20 +679,37 @@ class RegExp extends Expr instanceof StrConst {
end = min(int i | i > start + 4 and this.getChar(i) = "?")
}

private predicate flag_group_start(int start, int end, string c) {
/**
* Holds if a parse mode starts between `start` and `end`.
*/
private predicate flag_group_start(int start, int end) {
this.isGroupStart(start) and
this.getChar(start + 1) = "?" and
end = start + 3 and
c = this.getChar(start + 2) and
c in ["i", "L", "m", "s", "u", "x"]
this.getChar(start + 2) in ["i", "L", "m", "s", "u", "x"] and
end = start + 2
}

/**
 * Holds if a parse mode group is between `start` and `end`, and includes the
 * mode flag `c`. For example the following span, with mode flag `i`:
 * ```
 * (?i)
 * ```
 *
 * A group may carry several flags: this predicate holds once for every
 * character `c` found at a position from `inStart` up to (but excluding)
 * `inEnd`, so a group such as `(?is)` yields both `i` and `s`.
 */
private predicate flag_group(int start, int end, string c) {
exists(int inStart, int inEnd |
// `inStart` is the end offset reported by `flag_group_start` for this group.
this.flag_group_start(start, inStart) and
this.groupContents(start, end, inStart, inEnd) and
// The range expression makes `c` range over each character in `inStart .. inEnd - 1`.
this.getChar([inStart .. inEnd - 1]) = c
)
}

/**
* Gets the mode of this regular expression string if
* it is defined by a prefix.
*/
string getModeFromPrefix() {
exists(string c | this.flag_group_start(_, _, c) |
exists(string c | this.flag_group(_, _, c) |
c = "i" and result = "IGNORECASE"
or
c = "L" and result = "LOCALE"
Expand Down
1 change: 1 addition & 0 deletions python/ql/test/library-tests/regex/Characters.expected
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 22 | 23 |
| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 24 | 25 |
| (?P<name>[\\w]+)\| | 10 | 12 |
| (?m)^(?!$) | 2 | 3 |
| (?m)^(?!$) | 4 | 5 |
| (?m)^(?!$) | 8 | 9 |
| (\\033\|~{) | 1 | 5 |
Expand Down
3 changes: 1 addition & 2 deletions python/ql/test/library-tests/regex/FirstLast.expected
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@
| (?P<name>[\\w]+)\| | first | 9 | 14 |
| (?P<name>[\\w]+)\| | last | 9 | 13 |
| (?P<name>[\\w]+)\| | last | 9 | 14 |
| (?m)^(?!$) | first | 4 | 5 |
| (?m)^(?!$) | first | 8 | 9 |
| (?m)^(?!$) | first | 2 | 3 |
| (?m)^(?!$) | last | 4 | 5 |
| (?m)^(?!$) | last | 8 | 9 |
| (\\033\|~{) | first | 1 | 5 |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 0 | 10 | (?:[^%]\|^) | 3 | 9 | [^%]\|^ |
| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 14 | 19 | (\\w*) | 15 | 18 | \\w* |
| (?P<name>[\\w]+)\| | 0 | 15 | (?P<name>[\\w]+) | 9 | 14 | [\\w]+ |
| (?m)^(?!$) | 0 | 4 | (?m) | 2 | 3 | m |
| (?m)^(?!$) | 5 | 10 | (?!$) | 8 | 9 | $ |
| (\\033\|~{) | 0 | 9 | (\\033\|~{) | 1 | 8 | \\033\|~{ |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | 2 | 16 | (?P<txt>[^[]*) | 10 | 15 | [^[]* |
Expand Down
4 changes: 3 additions & 1 deletion python/ql/test/library-tests/regex/Regex.expected
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,11 @@
| (?P<name>[\\w]+)\| | sequence | 0 | 15 |
| (?m)^(?!$) | $ | 8 | 9 |
| (?m)^(?!$) | ^ | 4 | 5 |
| (?m)^(?!$) | empty group | 0 | 4 |
| (?m)^(?!$) | char | 2 | 3 |
| (?m)^(?!$) | empty group | 5 | 10 |
| (?m)^(?!$) | non-empty group | 0 | 4 |
| (?m)^(?!$) | sequence | 0 | 10 |
| (?m)^(?!$) | sequence | 2 | 3 |
| (?m)^(?!$) | sequence | 8 | 9 |
| (\\033\|~{) | char | 1 | 5 |
| (\\033\|~{) | char | 6 | 7 |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
| tst.py:4:20:4:43 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.py:5:20:5:43 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.py:9:20:9:30 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
| tst.py:10:20:10:53 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
| tst.py:11:20:11:51 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
| tst.py:12:20:12:58 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
| tst.py:13:20:13:58 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
| tst.py:14:20:14:62 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
| tst.py:15:20:15:48 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
| tst.py:16:20:16:66 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
| tst.py:17:20:17:53 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
| tst.py:19:20:19:54 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.py:20:20:20:62 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
| tst.py:21:20:21:161 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 3, 4. |
| tst.py:22:17:22:71 | (<[a-z\\/!$]("[^"]*"\|'[^']*'\|[^'">])*>\|<!(--.*?--\\s*)+>) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 3 and comments ending with --!> are matched with capture group 1. |
| tst.py:23:20:23:263 | <(?:(?:!--([\\w\\W]*?)-->)\|(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)\|(?:!DOCTYPE([\\w\\W]*?)>)\|(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)\|(?:\\/([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)>)\|(?:([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)((?:\\s+[^"'>]+(?:(?:"[^"]*")\|(?:'[^']*')\|[^>]*))*\|\\/\|\\s+)>)) | This regular expression only parses --> (capture group 1) and not --!> as an HTML comment end tag. |
| tst.py:12:20:12:53 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
| tst.py:13:20:13:51 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
| tst.py:14:20:14:58 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
| tst.py:15:20:15:58 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
| tst.py:16:20:16:62 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
| tst.py:17:20:17:48 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
| tst.py:18:20:18:66 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
| tst.py:19:20:19:53 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
| tst.py:21:20:21:54 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.py:22:20:22:62 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
| tst.py:23:20:23:161 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 3, 4. |
| tst.py:24:17:24:71 | (<[a-z\\/!$]("[^"]*"\|'[^']*'\|[^'">])*>\|<!(--.*?--\\s*)+>) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 3 and comments ending with --!> are matched with capture group 1. |
| tst.py:25:20:25:263 | <(?:(?:!--([\\w\\W]*?)-->)\|(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)\|(?:!DOCTYPE([\\w\\W]*?)>)\|(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)\|(?:\\/([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)>)\|(?:([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)((?:\\s+[^"'>]+(?:(?:"[^"]*")\|(?:'[^']*')\|[^>]*))*\|\\/\|\\s+)>)) | This regular expression only parses --> (capture group 1) and not --!> as an HTML comment end tag. |
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
re.compile(r"""<!--.*-->""", re.IGNORECASE | re.DOTALL), # OK - we don't care regexps that only match comments
re.compile(r"""<!--.*--!?>""", re.IGNORECASE | re.DOTALL), # OK
re.compile(r"""<!--.*--!?>""", re.IGNORECASE), # NOT OK, does not match newlines
re.compile(r"""(?is)<!--.*--!?>"""), # OK
re.compile(r"""(?i)<!--.*--!?>"""), # NOT OK, does not match newlines [NOT DETECTED]
re.compile(r"""<script.*?>(.|\s)*?<\/script[^>]*>""", re.IGNORECASE), # NOT OK - doesn't match inside the script tag
re.compile(r"""<script[^>]*?>.*?<\/script[^>]*>""", re.IGNORECASE), # NOT OK - doesn't match newlines inside the content
re.compile(r"""<script(\s|\w|=|")*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # NOT OK - does not match single quotes for attribute values
Expand All @@ -23,4 +25,4 @@
re.compile(r"""<(?:(?:!--([\w\W]*?)-->)|(?:!\[CDATA\[([\w\W]*?)\]\]>)|(?:!DOCTYPE([\w\W]*?)>)|(?:\?([^\s\/<>]+) ?([\w\W]*?)[?/]>)|(?:\/([A-Za-z][A-Za-z0-9\-_\:\.]*)>)|(?:([A-Za-z][A-Za-z0-9\-_\:\.]*)((?:\s+[^"'>]+(?:(?:"[^"]*")|(?:'[^']*')|[^>]*))*|\/|\s+)>))"""), # NOT OK - capture groups
]

doFilters(filters)
doFilters(filters)
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,5 @@
| redos.py:391:15:391:25 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. |
| unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\u00c6'. |
| unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
| unittests.py:11:20:11:28 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings starting with 's' and containing many repetitions of '\\n'. |
| unittests.py:12:21:12:29 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings starting with 'is' and containing many repetitions of '\\n'. |
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@
# Treatment of line breaks
re.compile(r'(?:.|\n)*b') # No ReDoS.
re.compile(r'(?:.|\n)*b', re.DOTALL) # Has ReDoS.
re.compile(r'(?i)(?:.|\n)*b') # No ReDoS.
re.compile(r'(?s)(?:.|\n)*b') # Has ReDoS.
re.compile(r'(?is)(?:.|\n)*b') # Has ReDoS.