From fdcac82aacacf603fc0c2e3646ec50bfc9b42163 Mon Sep 17 00:00:00 2001
From: Rik
Date: Thu, 11 Feb 2021 19:39:58 -0500
Subject: [PATCH] Improved javascript regex recognizing for extracting js messages

---
 CHANGES.rst                       |  9 +++++
 babel/messages/jslexer.py         | 59 ++++++++++++++++++++++++++++++-
 tests/messages/test_js_extract.py | 29 +++++++++++++++
 3 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 6e84fe92b..e1c4eb3ce 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,6 +1,15 @@
 Babel Changelog
 ===============
 
+Next version
+--------------
+
+Bugfixes
+~~~~~~~~
+
+* Regex for parsing JavaScript regexes improved. Before this, the lexer couldn't recognize certain regexes,
+  breaking the parsing of JS files.
+
 Version 2.9.1
 -------------
 
diff --git a/babel/messages/jslexer.py b/babel/messages/jslexer.py
index ef30c993e..efa6a1419 100644
--- a/babel/messages/jslexer.py
+++ b/babel/messages/jslexer.py
@@ -24,7 +24,64 @@
 name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
 dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
 division_re = re.compile(r'/=?')
-regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
+
+regex_re = re.compile(
+    r'''
+
+    # Opening slash of the regex
+    /
+
+    (?:
+
+        # 1) Backslashed character
+        #
+        # Match a backslash `\` and then its following character, allowing
+        # to backslash the `/` for example.
+        \\.
+
+        |
+
+        # 2) Regex character class `[a-z]`
+        #
+        # Match regex character class, like `[a-z]`. Inside a character
+        # class, a `/` character may appear, which does not close the
+        # regex. Therefore we allow it here inside a character class. A
+        # backslashed character (such as `\]`) is consumed as a pair, so
+        # an escaped bracket does not end the class.
+        \[
+            (?:
+                [^\]\\]
+                |
+                \\.
+            )*
+        \]
+
+        |
+
+        # 3) Other characters
+        #
+        # Match anything except a closing slash `/`, a backslash `\`, or an
+        # opening bracket `[`. Those last two will be handled by the other
+        # matchers.
+        #
+        # Every alternative consumes exactly one item and starts with a
+        # different character, so the repetition below never has two ways
+        # to match the same text and cannot backtrack exponentially on an
+        # unterminated regex.
+        [^/\\\[]
+
+    )*
+
+    # Closing slash of the regex
+    /
+
+    # regex flags
+    [a-zA-Z]*
+
+    ''',
+    re.DOTALL | re.VERBOSE
+)
+
 line_re = re.compile(r'(\r\n|\n|\r)')
 line_join_re = re.compile(r'\\' + line_re.pattern)
 uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
diff --git a/tests/messages/test_js_extract.py b/tests/messages/test_js_extract.py
index 73b16a934..28b2a292e 100644
--- a/tests/messages/test_js_extract.py
+++ b/tests/messages/test_js_extract.py
@@ -151,3 +151,32 @@ def test_template_string_tag_usage():
     )
 
     assert messages == [(1, 'Tag template, wow', [], None)]
+
+
+def test_regex_with_non_escaped_slash():
+    """
+    Test if regexes with non-escaped slashes are parsed correctly.
+
+    A Javascript regex that is opened and closed with slashes, allows a
+    non-escaped slash inside a character class, like: [/]. In the past, the
+    babel JS lexer thought this closed the regex.
+
+    If a " followed the falsely closing /, then babel thought a javascript
+    string was started, and would stretch it to the next quote. This caused the
+    bug.
+
+    The regex in babel/messages/jslexer.py now covers this scenario, and this
+    unit test makes sure it works.
+    """
+    buf = BytesIO(b"""\
+msg1 = _('message 1')
+regex1 = /[/]"/
+msg2 = _('message 2')
+fake_closing_quote = '"'
+    """)
+    messages = \
+        list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS,
+                             [], {}))
+
+    assert messages == [(1, 'message 1', [], None),
+                        (3, 'message 2', [], None)]