Test for compliance in strict and lax mode

jg-rp · jg-rp · commit 7e09eaffb442 · 2025-08-16T19:29:48.000+01:00
diff --git a/.github/workflows/tests-no-regex.yaml b/.github/workflows/tests-no-regex.yaml
@@ -0,0 +1,19 @@
+name: test-no-regex
+on: [push, pull_request]
+
+jobs:
+  test:  # runs the `test` script in the hatch `no-regex` environment
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: true
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade hatch
+      - run: hatch -e no-regex run test
diff --git a/jsonpath/function_extensions/match.py b/jsonpath/function_extensions/match.py
@@ -2,9 +2,13 @@
 
 try:
     import regex as re
+
+    REGEX_AVAILABLE = True
 except ImportError:
     import re  # type: ignore
 
+    REGEX_AVAILABLE = False
+
 from jsonpath.function_extensions import ExpressionType
 from jsonpath.function_extensions import FilterFunction
 
@@ -19,9 +23,15 @@ class Match(FilterFunction):
 
     def __call__(self, string: str, pattern: str) -> bool:
         """Return `True` if _string_ matches _pattern_, or `False` otherwise."""
+        # `map_re` is applied only when the optional `regex` package was imported.
+        # XXX: re.fullmatch caches compiled patterns, but `map_re` is not cached.
+        if REGEX_AVAILABLE:
+            try:
+                pattern = map_re(pattern)
+            except TypeError:
+                return False
+
         try:
-            # XXX: re.fullmatch caches compiled patterns internally, but `map_re` is not
-            # cached.
-            return bool(re.fullmatch(map_re(pattern), string))
+            return bool(re.fullmatch(pattern, string))
         except (TypeError, re.error):
             return False
diff --git a/jsonpath/function_extensions/search.py b/jsonpath/function_extensions/search.py
@@ -2,9 +2,13 @@
 
 try:
     import regex as re
+
+    REGEX_AVAILABLE = True
 except ImportError:
     import re  # type: ignore
 
+    REGEX_AVAILABLE = False
+
 from jsonpath.function_extensions import ExpressionType
 from jsonpath.function_extensions import FilterFunction
 
@@ -19,9 +23,15 @@ class Search(FilterFunction):
 
     def __call__(self, string: str, pattern: str) -> bool:
         """Return `True` if _string_ contains _pattern_, or `False` otherwise."""
+        # `map_re` is applied only when the optional `regex` package was imported.
+        # XXX: re.search caches compiled patterns, but `map_re` is not cached.
+        if REGEX_AVAILABLE:
+            try:
+                pattern = map_re(pattern)
+            except TypeError:
+                return False
+
         try:
-            # XXX: re.search caches compiled patterns internally, but `map_re` is not
-            # cached.
-            return bool(re.search(map_re(pattern), string))
+            return bool(re.search(pattern, string))
         except (TypeError, re.error):
             return False
diff --git a/jsonpath/lex.py b/jsonpath/lex.py
@@ -140,8 +140,11 @@ def compile_rules(self) -> Pattern[str]:
             (TOKEN_RE_PATTERN, self.re_pattern),
             (TOKEN_DOT_KEY_PROPERTY, self.dot_key_pattern),
             (TOKEN_DOT_PROPERTY, self.dot_property_pattern),
-            (TOKEN_FLOAT, r"-?\d+\.\d*(?:[eE][+-]?\d+)?"),
-            (TOKEN_INT, r"-?\d+(?P<G_EXP>[eE][+\-]?\d+)?\b"),
+            (
+                TOKEN_FLOAT,  # a decimal fraction, or digits with a negative exponent
+                r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)",
+            ),
+            (TOKEN_INT, r"-?[0-9]+(?:[eE]\+?[0-9]+)?"),
             (TOKEN_DDOT, r"\.\."),
             (TOKEN_DOT, r"\."),
             (TOKEN_AND, self.logical_and_pattern),
@@ -202,8 +205,11 @@ def compile_strict_rules(self) -> Pattern[str]:
             (TOKEN_SINGLE_QUOTE_STRING, self.single_quote_pattern),
             (TOKEN_DOT_KEY_PROPERTY, self.dot_key_pattern),
             (TOKEN_DOT_PROPERTY, self.dot_property_pattern),
-            (TOKEN_FLOAT, r"-?\d+\.\d*(?:[eE][+-]?\d+)?"),
-            (TOKEN_INT, r"-?\d+(?P<G_EXP>[eE][+\-]?\d+)?\b"),
+            (
+                TOKEN_FLOAT,  # a decimal fraction, or digits with a negative exponent
+                r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)",
+            ),
+            (TOKEN_INT, r"-?[0-9]+(?:[eE]\+?[0-9]+)?"),
             (TOKEN_DDOT, r"\.\."),
             (TOKEN_DOT, r"\."),
             (TOKEN_AND, r"&&"),
@@ -288,19 +294,6 @@ def tokenize(self, path: str) -> Iterator[Token]:  # noqa PLR0912
                     value=match.group("G_SQUOTE"),
                     index=match.start("G_SQUOTE"),
                 )
-            elif kind == TOKEN_INT:
-                if match.group("G_EXP") and match.group("G_EXP")[1] == "-":
-                    yield _token(
-                        kind=TOKEN_FLOAT,
-                        value=match.group(),
-                        index=match.start(),
-                    )
-                else:
-                    yield _token(
-                        kind=TOKEN_INT,
-                        value=match.group(),
-                        index=match.start(),
-                    )
             elif kind == TOKEN_RE_PATTERN:
                 yield _token(
                     kind=TOKEN_RE_PATTERN,
diff --git a/jsonpath/parse.py b/jsonpath/parse.py
@@ -104,6 +104,7 @@
 from .token import TOKEN_WHITESPACE
 from .token import TOKEN_WILD
 from .token import Token
+from .unescape import unescape_string
 
 if TYPE_CHECKING:
     from .env import JSONPathEnvironment
@@ -623,11 +624,23 @@ def parse_string_literal(self, stream: TokenStream) -> BaseExpression:
         return StringLiteral(value=self._decode_string_literal(stream.next()))
 
     def parse_integer_literal(self, stream: TokenStream) -> BaseExpression:
+        """Parse an integer literal; strict mode rejects a leading zero ("01")."""
+        token = stream.next()
+        # Ignore the sign; "0", "-0" and "0e1" have no digit after the zero.
+        unsigned = token.value.lstrip("-")
+        if self.env.strict and unsigned[:1] == "0" and unsigned[1:2].isdigit():
+            raise JSONPathSyntaxError("invalid integer literal", token=token)
         # Convert to float first to handle scientific notation.
-        return IntegerLiteral(value=int(float(stream.next().value)))
+        return IntegerLiteral(value=int(float(token.value)))
 
     def parse_float_literal(self, stream: TokenStream) -> BaseExpression:
-        return FloatLiteral(value=float(stream.next().value))
+        """Parse a float literal, rejecting a leading zero such as "01.5"."""
+        token = stream.next()
+        # Ignore the sign; "0.5" and "0e-1" have no digit straight after the zero.
+        unsigned = token.value.lstrip("-")
+        if unsigned[:1] == "0" and unsigned[1:2].isdigit():
+            raise JSONPathSyntaxError("invalid float literal", token=token)
+        return FloatLiteral(value=float(token.value))
 
     def parse_prefix_expression(self, stream: TokenStream) -> BaseExpression:
         token = stream.next()
@@ -839,11 +852,19 @@ def parse_filter_expression(
         return left
 
     def _decode_string_literal(self, token: Token) -> str:
+        if self.env.strict:
+            return unescape_string(
+                token.value,
+                token,
+                "'" if token.kind == TOKEN_SINGLE_QUOTE_STRING else '"',
+            )
+
         if self.env.unicode_escape:
             if token.kind == TOKEN_SINGLE_QUOTE_STRING:
                 value = token.value.replace('"', '\\"').replace("\\'", "'")
             else:
                 value = token.value
+
             try:
                 rv = json.loads(f'"{value}"')
                 assert isinstance(rv, str)
diff --git a/jsonpath/unescape.py b/jsonpath/unescape.py
@@ -0,0 +1,134 @@
+r"""Replace `\uXXXX` escape sequences with Unicode code points."""
+
+from typing import List
+from typing import Tuple
+
+from .exceptions import JSONPathSyntaxError
+from .token import Token
+
+
+def unescape_string(value: str, token: Token, quote: str) -> str:
+    """Decode the escape sequences in `value` and return the resulting string."""
+    parts: List[str] = []
+    pos = 0
+    end = len(value)
+    while pos < end:
+        char = value[pos]
+        if char != "\\":
+            # Validation only: raises for raw code points at or below 0x1F.
+            _string_from_codepoint(ord(char), token)
+        else:
+            # Decode starting at the character after the backslash.
+            char, pos = _decode_escape_sequence(value, pos + 1, token, quote)
+        parts.append(char)
+        pos += 1
+    return "".join(parts)
+
+
+def _decode_escape_sequence(
+    value: str, index: int, token: Token, quote: str
+) -> Tuple[str, int]:
+    """Decode one escape; `index` is the character right after the backslash.
+
+    Returns the replacement text and the index of the last character consumed.
+    """
+    try:
+        escaped = value[index]
+    except IndexError as err:
+        raise JSONPathSyntaxError("incomplete escape sequence", token=token) from err
+    # Single-character escapes; `quote` is listed last so that it takes priority.
+    simple = {
+        "\\": "\\",
+        "/": "/",
+        "b": "\x08",
+        "f": "\x0c",
+        "n": "\n",
+        "r": "\r",
+        "t": "\t",
+        quote: quote,
+    }
+    if escaped in simple:
+        return simple[escaped], index
+    if escaped == "u":
+        # NOTE(review): escaped code points <= 0x1F are rejected as well - confirm.
+        codepoint, index = _decode_hex_char(value, index, token)
+        return _string_from_codepoint(codepoint, token), index
+    raise JSONPathSyntaxError(
+        f"unknown escape sequence at index {token.index + index - 1}",
+        token=token,
+    )
+
+
+def _decode_hex_char(value: str, index: int, token: Token) -> Tuple[int, int]:
+    """Decode the `uXXXX` escape whose `u` is at `index`, joining surrogate pairs.
+
+    Returns the code point and the index of the last hex digit consumed.
+    """
+    length = len(value)
+
+    if index + 4 >= length:
+        raise JSONPathSyntaxError(
+            f"incomplete escape sequence at index {token.index + index - 1}",
+            token=token,
+        )
+
+    index += 1  # step from 'u' to the first hex digit
+    codepoint = _parse_hex_digits(value[index : index + 4], token)
+    if _is_low_surrogate(codepoint):
+        raise JSONPathSyntaxError(
+            f"unexpected low surrogate at index {token.index + index - 1}",
+            token=token,
+        )
+
+    if not _is_high_surrogate(codepoint):
+        return (codepoint, index + 3)
+
+    # A high surrogate must be followed by a second `\uXXXX` escape.
+    if index + 9 >= length or value[index + 4] != "\\" or value[index + 5] != "u":
+        raise JSONPathSyntaxError(
+            f"incomplete escape sequence at index {token.index + index - 2}",
+            token=token,
+        )
+
+    low_surrogate = _parse_hex_digits(value[index + 6 : index + 10], token)
+    if not _is_low_surrogate(low_surrogate):
+        raise JSONPathSyntaxError(
+            f"unexpected codepoint at index {token.index + index + 4}",
+            token=token,
+        )
+
+    # Merge the low ten bits of each surrogate, then add the 0x10000 offset.
+    high_bits = (codepoint & 0x03FF) << 10
+    return (0x10000 + (high_bits | (low_surrogate & 0x03FF)), index + 9)
+
+
+def _parse_hex_digits(digits: str, token: Token) -> int:
+    """Return the integer value of the hexadecimal string `digits`."""
+    codepoint = 0
+    # Work on the encoded bytes: non-ASCII characters fall outside every range.
+    for byte in digits.encode():
+        if 48 <= byte <= 57:  # "0"-"9"
+            nibble = byte - 48
+        elif 65 <= byte <= 70:  # "A"-"F"
+            nibble = byte - 55
+        elif 97 <= byte <= 102:  # "a"-"f"
+            nibble = byte - 87
+        else:
+            raise JSONPathSyntaxError("invalid \\uXXXX escape sequence", token=token)
+        codepoint = (codepoint << 4) | nibble
+
+    return codepoint
+
+
+def _string_from_codepoint(codepoint: int, token: Token) -> str:
+    if codepoint > 0x1F:  # code points at or below 0x1F are rejected
+        return chr(codepoint)
+    raise JSONPathSyntaxError("invalid character", token=token)
+
+
+def _is_high_surrogate(codepoint: int) -> bool:
+    return 0xD800 <= codepoint <= 0xDBFF  # first half of a UTF-16 surrogate pair
+
+
+def _is_low_surrogate(codepoint: int) -> bool:
+    return 0xDC00 <= codepoint <= 0xDFFF  # second half of a UTF-16 surrogate pair
diff --git a/pyproject.toml b/pyproject.toml
@@ -192,4 +192,5 @@ convention = "google"
 "jsonpath/__init__.py" = ["D104"]
 "jsonpath/selectors.py" = ["D102"]
 "jsonpath/filter.py" = ["D102", "PLW1641"]
+"jsonpath/unescape.py" = ["PLR2004"]
 "tests/*" = ["D100", "D101", "D104", "D103"]
diff --git a/tests/test_compliance.py b/tests/test_compliance.py