From fdcac82aacacf603fc0c2e3646ec50bfc9b42163 Mon Sep 17 00:00:00 2001
From: Rik <gitaarik@posteo.net>
Date: Thu, 11 Feb 2021 19:39:58 -0500
Subject: [PATCH] Improved javascript regex recognizing for extracting js
 messages

---
 CHANGES.rst                       |  9 ++++++
 babel/messages/jslexer.py         | 52 ++++++++++++++++++++++++++++++-
 tests/messages/test_js_extract.py | 29 +++++++++++++++++
 3 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 6e84fe92b..e1c4eb3ce 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,6 +1,15 @@
 Babel Changelog
 ===============
 
+Next version
+--------------
+
+Bugfixes
+~~~~~~~~
+
+* Regex for parsing JavaScript regexes improved. Before this, the lexer couldn't recognize regexes with an
+  unescaped ``/`` inside a character class (e.g. ``/[/]/``), breaking the parsing of JS files.
+
 Version 2.9.1
 -------------
 
diff --git a/babel/messages/jslexer.py b/babel/messages/jslexer.py
index ef30c993e..efa6a1419 100644
--- a/babel/messages/jslexer.py
+++ b/babel/messages/jslexer.py
@@ -24,7 +24,57 @@
 name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
 dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
 division_re = re.compile(r'/=?')
-regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
+
+regex_re = re.compile(
+    r'''
+        # Matches one JavaScript regex literal, including trailing flags.
+        #
+        # Each alternative below consumes exactly one unit and starts with a
+        # different character, so the pattern cannot backtrack
+        # catastrophically when no closing slash follows.
+
+        # Opening slash of the regex
+        /
+
+        (?:
+            # 1) Backslashed character
+            #
+            # Match a backslash `\` and the character following it, which
+            # allows escaping the `/`, for example.
+            \\.
+
+            |
+
+            # 2) Regex character class `[a-z]`
+            #
+            # Inside a character class a `/` may appear without closing the
+            # regex. The class ends at the first `]` that is not escaped, so a
+            # backslash always consumes the character after it.
+            \[
+                (?:
+                    [^\]\\]
+                    |
+                    \\.
+                )*
+            \]
+
+            |
+
+            # 3) Any other single character
+            #
+            # Match anything except a closing slash `/`, a backslash `\` or an
+            # opening bracket `[`; the latter two are handled by the
+            # alternatives above.
+            [^/\\\[]
+        )*
+
+        # Closing slash of the regex, followed by optional flags
+        /
+        [a-zA-Z]*
+    ''',
+    re.DOTALL | re.VERBOSE
+)
+
 line_re = re.compile(r'(\r\n|\n|\r)')
 line_join_re = re.compile(r'\\' + line_re.pattern)
 uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
diff --git a/tests/messages/test_js_extract.py b/tests/messages/test_js_extract.py
index 73b16a934..28b2a292e 100644
--- a/tests/messages/test_js_extract.py
+++ b/tests/messages/test_js_extract.py
@@ -151,3 +151,32 @@ def test_template_string_tag_usage():
     )
 
     assert messages == [(1, 'Tag template, wow', [], None)]
+
+
+def test_regex_with_non_escaped_slash():
+    """
+    Test that regexes with non-escaped slashes are parsed correctly.
+
+    A JavaScript regex literal, which is delimited by slashes, allows a
+    non-escaped slash inside a character class, like: [/]. In the past, the
+    Babel JS lexer thought this slash closed the regex.
+
+    If a " followed the falsely closing /, Babel thought a JavaScript string
+    had started and stretched it to the next quote, so the message on line 3
+    below was swallowed by that bogus string and never extracted.
+
+    The regex in babel/messages/jslexer.py now covers this scenario, and this
+    unit test makes sure it works.
+    """
+    buf = BytesIO(b"""\
+msg1 = _('message 1')
+regex1 = /[/]"/
+msg2 = _('message 2')
+fake_closing_quote = '"'
+    """)
+    messages = \
+        list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS,
+                             [], {}))
+
+    assert messages == [(1, 'message 1', [], None),
+                        (3, 'message 2', [], None)]