Improved javascript template string expression extracting (#939)

johanneswilm · gitaarik · akx · web-flow · commit d425f86a08d5 · 2023-01-06T22:18:35.000+02:00
Co-authored-by: Rik <gitaarik@posteo.net>
Co-authored-by: Aarni Koskela <akx@iki.fi>
diff --git a/babel/messages/extract.py b/babel/messages/extract.py
@@ -16,9 +16,10 @@
     :license: BSD, see LICENSE for more details.
 """
 import ast
+import io
 import os
-from os.path import relpath
 import sys
+from os.path import relpath
 from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
 
 from babel.util import parse_encoding, parse_future_flags, pathmatch
@@ -532,7 +533,7 @@ def _parse_python_string(value, encoding, future_flags):
     return None
 
 
-def extract_javascript(fileobj, keywords, comment_tags, options):
+def extract_javascript(fileobj, keywords, comment_tags, options, lineno=1):
     """Extract messages from JavaScript source code.
 
     :param fileobj: the seekable, file-like object the messages should be
@@ -544,7 +545,11 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
     :param options: a dictionary of additional options (optional)
                     Supported options are:
                     * `jsx` -- set to false to disable JSX/E4X support.
-                    * `template_string` -- set to false to disable ES6 template string support.
+                    * `template_string` -- if `True`, supports gettext(`key`)
+                    * `parse_template_string` -- if `True` will parse the
+                                                 contents of javascript
+                                                 template strings.
+    :param lineno: line number offset (for parsing embedded fragments)
     """
     from babel.messages.jslexer import Token, tokenize, unquote_string
     funcname = message_lineno = None
@@ -556,12 +561,12 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
     last_token = None
     call_stack = -1
     dotted = any('.' in kw for kw in keywords)
-
     for token in tokenize(
         fileobj.read().decode(encoding),
         jsx=options.get("jsx", True),
         template_string=options.get("template_string", True),
-        dotted=dotted
+        dotted=dotted,
+        lineno=lineno
     ):
         if (  # Turn keyword`foo` expressions into keyword("foo") calls:
             funcname and  # have a keyword...
@@ -573,7 +578,11 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
             call_stack = 0
             token = Token('operator', ')', token.lineno)
 
-        if token.type == 'operator' and token.value == '(':
+        if options.get('parse_template_string') and not funcname and token.type == 'template_string':
+            for item in parse_template_string(token.value, keywords, comment_tags, options, token.lineno):
+                yield item
+
+        elif token.type == 'operator' and token.value == '(':
             if funcname:
                 message_lineno = token.lineno
                 call_stack += 1
@@ -665,3 +674,41 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
             funcname = token.value
 
         last_token = token
+
+
+def parse_template_string(template_string, keywords, comment_tags, options, lineno=1):
+    """Extract messages from the ``${...}`` expressions of a JavaScript template string.
+
+    :param template_string: the template string to parse, delimiters included
+    :param keywords: a list of keywords (i.e. function names) that should be
+                     recognized as translation functions
+    :param comment_tags: a list of translator tags to search for and include
+                         in the results
+    :param options: a dictionary of additional options (optional)
+    :param lineno: starting line number (optional)
+    """
+    from babel.messages.jslexer import line_re
+    prev_character = None
+    level = 0  # brace depth; non-zero while inside a `${...}` expression
+    inside_str = False  # quote character of the string being scanned, else False
+    expression_contents = ''
+    for character in template_string[1:-1]:
+        if level and not inside_str and character in ('"', "'", '`'):  # quotes in literal text are plain text
+            inside_str = character
+        elif inside_str == character and prev_character != '\\':  # a backslash-escaped quote does not close
+            inside_str = False
+        if level:
+            expression_contents += character
+        if not inside_str:
+            if character == '{' and (level or prev_character == '$'):  # also track braces nested in the expression
+                level += 1
+            elif level and character == '}':
+                level -= 1
+                if level == 0 and expression_contents:
+                    expression_contents = expression_contents[0:-1]  # drop the closing brace collected above
+                    fake_file_obj = io.BytesIO(expression_contents.encode())
+                    for item in extract_javascript(fake_file_obj, keywords, comment_tags, options, lineno):
+                        yield item
+                    lineno += len(line_re.findall(expression_contents))  # NOTE: line breaks in literal text are not counted
+                    expression_contents = ''
+        prev_character = None if (character == '\\' and prev_character == '\\') else character  # `\\` escapes nothing
diff --git a/babel/messages/jslexer.py b/babel/messages/jslexer.py
@@ -151,17 +151,17 @@ def unquote_string(string):
     return u''.join(result)
 
 
-def tokenize(source, jsx=True, dotted=True, template_string=True):
+def tokenize(source, jsx=True, dotted=True, template_string=True, lineno=1):
     """
     Tokenize JavaScript/JSX source.  Returns a generator of tokens.
 
     :param jsx: Enable (limited) JSX parsing.
     :param dotted: Read dotted names as single name token.
     :param template_string: Support ES6 template strings
+    :param lineno: starting line number (optional)
     """
     may_divide = False
     pos = 0
-    lineno = 1
     end = len(source)
     rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)
 
diff --git a/tests/messages/test_js_extract.py b/tests/messages/test_js_extract.py
@@ -150,3 +150,42 @@ def test_template_string_tag_usage():
     )
 
     assert messages == [(1, 'Tag template, wow', [], None)]
+
+
+def test_inside_template_string():
+    """A gettext() call inside a `${...}` expression is extracted."""
+    source = BytesIO(b"const msg = `${gettext('Hello')} ${user.name}`")
+    options = {'parse_template_string': True}
+    found = extract.extract('javascript', source, {"gettext": None}, [], options)
+
+    assert list(found) == [(1, 'Hello', [], None)]
+
+
+def test_inside_template_string_with_linebreaks():
+    source = BytesIO(b"""\
+const userName = gettext('Username')
+const msg = `${
+gettext('Hello')
+} ${userName} ${
+gettext('Are you having a nice day?')
+}`
+const msg2 = `${
+gettext('Howdy')
+} ${userName} ${
+gettext('Are you doing ok?')
+}`
+""")
+    found = list(extract.extract('javascript', source, {"gettext": None}, [], {'parse_template_string': True}))
+    assert found == [  # line numbers count from the first line of the JS source above
+        (1, 'Username', [], None), (3, 'Hello', [], None), (5, 'Are you having a nice day?', [], None),
+        (8, 'Howdy', [], None), (10, 'Are you doing ok?', [], None),
+    ]
+
+
+def test_inside_nested_template_string():
+    """Calls inside template strings nested within expressions are extracted too."""
+    source = BytesIO(b"const msg = `${gettext('Greetings!')} ${ evening ? `${user.name}: ${gettext('This is a lovely evening.')}` : `${gettext('The day is really nice!')} ${user.name}`}`")
+    options = {'parse_template_string': True}
+    found = extract.extract('javascript', source, {"gettext": None}, [], options)
+
+    assert list(found) == [(1, 'Greetings!', [], None), (1, 'This is a lovely evening.', [], None), (1, 'The day is really nice!', [], None)]