tfeldmann · tfeldmann · Nov 16, 2022 · Nov 17, 2022 · Nov 17, 2022
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,11 @@ classifiers = [
 [tool.poetry.dependencies]
 python = "^3.5"
 
+[tool.isort]
+profile = "black"
+skip_gitignore = true
+line_length = 88
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
diff --git a/simplematch.py b/simplematch.py
@@ -18,7 +18,7 @@
 
 
 def register_type(name, regex, converter=str):
-    """ register a type to be available for the {value:type} matching syntax """
+    """register a type to be available for the {value:type} matching syntax"""
     cleaned = TYPE_CLEANUP_REGEX.sub("(?:", regex)
     types[name] = Type(regex=cleaned, converter=converter)
 
@@ -133,7 +133,7 @@ def _create_regex(self, pattern):
 
     @staticmethod
     def _grouplist(match):
-        """ extract unnamed match groups """
+        """extract unnamed match groups"""
         # https://stackoverflow.com/a/53385788/300783
         named = match.groupdict()
         ignored_groups = set()

diff --git a/simplematch/__init__.py b/simplematch/__init__.py
diff --git a/simplematch/converters.py b/simplematch/converters.py
@@ -0,0 +1,136 @@
+import decimal
+from ipaddress import IPv4Address
+
+
+class QuantifierMixin:
+    """
+    Base class for converters that accept block arguments.
+
+    `args` is the raw argument text of a block (the part in square brackets)
+    or None if the block has no arguments. It is optional so that a converter
+    can also be instantiated without any arguments.
+    """
+
+    def __init__(self, args=None):
+        # kept for subclasses; the base class itself does not interpret it
+        self.args = args
+
+
+class Str(QuantifierMixin):
+    """Converter for arbitrary text: `.*` matches any (possibly empty) run of characters."""
+
+    regex = r".*"
+
+    @staticmethod
+    def to_python(value: str) -> str:
+        """Return the matched text unchanged."""
+        return value
+
+
+class Int(QuantifierMixin):
+    """Converter for optionally signed integers made of one or more digits."""
+
+    # the `+` quantifier lets multi-digit numbers such as "42" match as a whole
+    regex = r"[+-]?[0-9]+"
+
+    @staticmethod
+    def to_python(value: str) -> int:
+        """Return the matched text converted with `int`."""
+        return int(value)
+
+
+class Float:
+    """
+    Converter for optionally signed decimal numbers such as `12`, `-1.5` or `.5`.
+
+    The digits before the decimal point are optional, the digits after it are not.
+    """
+
+    regex = r"[+-]?([0-9]*[.])?[0-9]+"
+
+    @staticmethod
+    def to_python(value: str) -> float:
+        """Return the matched text converted with `float`."""
+        return float(value)
+
+
+class Decimal(Float):
+    """Reuses the `Float` pattern but converts the text to `decimal.Decimal`."""
+
+    @staticmethod
+    def to_python(value: str) -> decimal.Decimal:
+        """Return the matched text as a `decimal.Decimal`."""
+        return decimal.Decimal(value)
+
+
+class FourDigitYear(Int):
+    """
+    Converter for a run of exactly four digits.
+
+    The conversion to `int` is the `to_python` inherited from `Int`.
+    """
+
+    regex = "[0-9]{4}"
+
+
+class Letters(Str):
+    """Converter for one or more ASCII letters (`a-z`, `A-Z`)."""
+
+    regex = r"[a-zA-Z]+"
+
+
+class RomanNumeral(Int):
+    """
+    Converter for Roman numerals written with the letters M, D, C, L, X, V and I.
+
+    `Int.to_python` cannot be reused because `int("XIV")` raises a ValueError,
+    so the numeral is evaluated here.
+    """
+
+    regex = r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"
+
+    @staticmethod
+    def to_python(value: str) -> int:
+        """Return the integer value of the Roman numeral `value`."""
+        digits = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
+        total = 0
+        previous = 0
+        # Walk from right to left: a digit smaller than its right neighbour is
+        # subtractive (the I in IV), every other digit is additive.
+        for char in reversed(value.upper()):
+            current = digits[char]
+            if current < previous:
+                total -= current
+            else:
+                total += current
+            previous = current
+        return total
+
+
+class Bitcoin(Str):
+    """
+    Pattern for Bitcoin-address-like text: a `bc1`, `1` or `3` prefix followed by
+    25 to 39 characters from the character class in the regex.
+    """
+
+    regex = r"(bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}"
+
+
+class Email(Str):
+    """Pattern for email addresses of the form `local@domain.suffix`."""
+
+    regex = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
+
+
+class Url(Str):
+    """
+    Pattern for `http://` and `https://` URLs: an optional `www.`, a host, a
+    dot-separated suffix of 1 to 6 characters and optional trailing
+    path/query characters.
+    """
+
+    regex = (
+        r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b"
+        r"([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)"
+    )
+
+
+class IpV4:
+    """Converter for dotted-quad IPv4 addresses (four octets in the range 0-255)."""
+
+    regex = (
+        r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
+        r"(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"
+    )
+
+    # static like the `to_python` of the other converters, so it can be called
+    # on the class as well as on an instance
+    @staticmethod
+    def to_python(value: str) -> IPv4Address:
+        """Return the matched text as an `ipaddress.IPv4Address`."""
+        return IPv4Address(value)
+
+
+class IpV6:
+    """
+    Pattern for textual IPv6 addresses.
+
+    The alternatives cover the full eight-group form, `::`-compressed forms,
+    `fe80:` addresses with a `%zone` suffix and forms ending in a dotted IPv4
+    address. The class defines no `to_python`.
+    """
+
+    regex = (
+        r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA"
+        r"-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){"
+        r"1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3"
+        r"}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0"
+        r"-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:"
+        r"(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5"
+        r"]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0"
+        r"-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,"
+        r"3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"
+    )
+
+
+class Port:
+    """
+    Pattern for port numbers; the five-digit alternatives stop at 65535.
+
+    NOTE(review): the `[0-5]{0,5}` alternative also matches the empty string -
+    confirm that this is intended.
+    """
+
+    regex = (
+        r"((6553[0-5])|(655[0-2][0-9])|(65[0-4][0-9]{2})|(6[0-4][0-9]{3})|"
+        r"([1-5][0-9]{4})|([0-5]{0,5})|([0-9]{1,4}))"
+    )
+
+
+class MacAddress:
+    """Pattern for MAC addresses: six colon-separated pairs of hexadecimal digits."""
+
+    regex = r"[a-fA-F0-9]{2}(:[a-fA-F0-9]{2}){5}"
+
+
+class SocialSecurityNumber(Str):
+    """
+    Pattern for numbers of the form `DDD-DD-DDDD`.
+
+    The lookaheads reject a first group of `000` or `666`, a second group of
+    `00` and a third group of `0000`; the first digit must be in `0-8`.
+    """
+
+    regex = r"(?!0{3})(?!6{3})[0-8]\d{2}-(?!0{2})\d{2}-(?!0{4})\d{4}"
+
+
+class CreditCard:
+    """
+    Pattern built from several alternative card number formats.
+
+    NOTE(review): all alternatives except `3[47][0-9]{13}` are wrapped in
+    `^...$` anchors, so they can only match when the number is the whole
+    text - confirm this is intended for use inside larger patterns.
+    """
+
+    regex = (
+        r"(^4[0-9]{12}(?:[0-9]{3})?$)|(^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6]["
+        r"0-9]{2}|27[01][0-9]|2720)[0-9]{12}$)|(3[47][0-9]{13})|(^3(?:0[0-5]|[68][0-9])"
+        r"[0-9]{11}$)|(^6(?:011|5[0-9]{2})[0-9]{12}$)|(^(?:2131|1800|35\d{3})\d{11}$)"
+    )
+
+
+class LatLon:
+    """
+    Pattern for a pair of optionally signed decimal numbers separated by a
+    comma and optional whitespace, e.g. `52.5,-13.4`.
+    """
+
+    regex = r"((\-?|\+?)?\d+(\.\d+)?),\s*((\-?|\+?)?\d+(\.\d+)?)"
+
+
+class SemanticVersion:
+    """
+    Pattern for versions of the form `MAJOR.MINOR.PATCH` with an optional
+    `-prerelease` part and an optional `+build` part.
+
+    The numeric parts do not allow leading zeros.
+    """
+
+    regex = (
+        r"(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)"
+        r"(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)"
+        r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?"
+        r"(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?"
+    )
+
+
+class JiraIssueTicket:
+    """
+    Pattern for issue keys such as `ABC-123`: two or more uppercase ASCII
+    letters, a hyphen and one or more digits.
+    """
+
+    regex = r"[A-Z]{2,}-\d+"
+
+
+class Hashtag:
+    """
+    Pattern for hashtags: a `#` followed by zero or more characters that are
+    neither a space nor one of the punctuation characters listed in the regex.
+    """
+
+    regex = r"#[^ !@#$%^&*(),.?\":{}|<>]*"
diff --git a/simplematch/py.typed b/simplematch/py.typed
diff --git a/simplematch/simplematch.py b/simplematch/simplematch.py
@@ -0,0 +1,153 @@
+"""
+simplematch
+"""
+import re
+from typing import NamedTuple, Optional
+from collections import defaultdict
+
+from . import converters as cv
+
+
+class Block(NamedTuple):
+    """The three parts of a block written as `<name:converter[args]>`."""
+
+    # identifier of the block, None if not given
+    name: Optional[str]
+    # name of the converter, None if not given
+    converter: Optional[str]
+    # raw text between the square brackets, None if not given
+    args: Optional[str]
+
+
+def block_parser_regex(block_start_string: str = "<", block_end_string: str = ">"):
+    """
+    Assembles a regular expression which matches wildcards (`*`) and blocks
+    in the form of
+
+        `<name:converter[args]>`
+
+    Block delimiters (`<` and `>`) can be changed via the `block_start_string` and
+    `block_end_string` arguments. The delimiters are taken literally, so
+    characters with a special meaning in regular expressions (for example `(`)
+    are allowed.
+
+    Matches have three captures: (`name`, `converter`, `args`). A capture is
+    None if the corresponding part is missing; for a wildcard all three are None.
+
+    Returns the compiled regular expression.
+    """
+    # https://regex101.com/r/xS2B04/3
+    # Escape the delimiters: they are literal text, not regex syntax, and the
+    # pattern is compiled in verbose mode where whitespace and `#` are special.
+    start = re.escape(block_start_string)
+    end = re.escape(block_end_string)
+    # characters allowed inside the name, converter and argument parts
+    safe_chars = r"[^:\[\]%s%s]" % (start, end)
+    regex = re.compile(
+        r"""
+        (?<!\\)\*                 # match either an unescaped wildcard `*`
+        |                         # or
+        (?:                       # a converter definition
+            (?<!\\)               # allow escaping the block start string
+            {start}               # start block
+            ({safe}+?)?           # the optional identifier name.
+            (?:                   # make the converter part optional
+                :                 # converter definition starts with `:`
+                ({safe}+?)             # converter name
+                (?:\[({safe}+?)\])?    # converter arguments
+            )?                    # end of converter part
+            {end}                 # end of block
+        )
+        """.format(
+            start=start,
+            end=end,
+            safe=safe_chars,
+        ),
+        re.VERBOSE,
+    )
+    return regex
+
+
+class Environment:
+    """
+    Translates patterns written in the block syntax into regular expressions.
+
+    An environment bundles the block parser (built from the block delimiters),
+    the key under which unnamed blocks are collected and the table of known
+    converters.
+    """
+
+    # maps the converter name used inside a block to its converter class
+    converters = {
+        "str": cv.Str,
+        "int": cv.Int,
+        "float": cv.Float,
+        "decimal": cv.Decimal,
+        "yyyy": cv.FourDigitYear,
+        "letters": cv.Letters,
+        "roman": cv.RomanNumeral,
+        "bitcoin": cv.Bitcoin,
+        "email": cv.Email,
+        "url": cv.Url,
+        "ipv4": cv.IpV4,
+        "ipv6": cv.IpV6,
+        "port": cv.Port,
+        "mac": cv.MacAddress,
+        "ssn": cv.SocialSecurityNumber,
+        "cc": cv.CreditCard,
+        "latlon": cv.LatLon,
+        "semver": cv.SemanticVersion,
+        "jira": cv.JiraIssueTicket,
+        "hashtag": cv.Hashtag,
+    }
+
+    def __init__(
+        self,
+        block_start_string: str,
+        block_end_string: str,
+        unnamed_key: str,
+    ):
+        """
+        `block_start_string` / `block_end_string` delimit a block,
+        `unnamed_key` is the key used for blocks without a name.
+        """
+        self.block_parser_regex = block_parser_regex(
+            block_start_string=block_start_string,
+            block_end_string=block_end_string,
+        )
+        self.unnamed_key = unnamed_key
+        # block name -> converters; filled by `_replacer` during `parse_pattern`
+        self._tmp_converters = defaultdict(list)
+
+    def _replacer(self, match: re.Match) -> str:
+        """
+        This does two things:
+        1. replaces a sm-syntax block with the regular expression given by the converter
+        2. Adds the converter in the temporary list of converters
+
+        A wildcard is replaced by `.*` and registers no converter. A missing or
+        unknown converter name falls back to `cv.Str`.
+        """
+        # strip whitespace from within the block
+        name, converter_name, args = (
+            x.strip() if x is not None else None for x in match.groups()
+        )
+        # handle wildcard (*): none of the three groups captured anything
+        if name is None and converter_name is None and args is None:
+            return r".*"
+        converter_cls = self.converters.get(converter_name, cv.Str)
+        # Converters built on QuantifierMixin take the block arguments in their
+        # constructor; all other converters are constructed without arguments.
+        if issubclass(converter_cls, cv.QuantifierMixin):
+            converter = converter_cls(args)
+        else:
+            converter = converter_cls()
+        self._tmp_converters[name or self.unnamed_key].append(converter)
+        return converter.regex
+
+    def parse_pattern(self, pattern: str):
+        """
+        Replace every wildcard and block in `pattern` by a regular expression.
+
+        Text outside of wildcards and blocks is copied to the result unchanged.
+
+        Returns a tuple `(regex, converters)`: the resulting regex string and
+        a dict mapping each block name (or `unnamed_key`) to the list of
+        converter instances created for it, in order of appearance.
+        """
+        # the converter table is per call; drop what a previous call collected
+        self._tmp_converters.clear()
+        result = self.block_parser_regex.sub(self._replacer, pattern)
+        return result, dict(self._tmp_converters)
+
+
+# Environment with `<` / `>` as block delimiters; unnamed blocks are collected
+# under the key "unnamed".
+DEFAULT_ENV = Environment(
+    block_start_string="<",
+    block_end_string=">",
+    unnamed_key="unnamed",
+)
+
+
+class Matcher:
+    """
+    Holds a pattern together with the regular expression and the converters
+    which the environment generated from it.
+
+    `case_sensitive` is stored on the instance; this class does not use it
+    otherwise. Creating a matcher prints the generated regex and converters.
+    """
+
+    def __init__(
+        self,
+        pattern: str = "*",
+        case_sensitive: bool = True,
+        environment=DEFAULT_ENV,
+    ):
+        self.pattern, self.case_sensitive = pattern, case_sensitive
+        self.environment = environment
+        # translate the pattern once, at construction time
+        parsed_regex, parsed_converters = self.environment.parse_pattern(pattern)
+        self.regex = parsed_regex
+        self.converters = parsed_converters
+        # debug output of the translation result
+        print("Regex: ", self.regex)
+        print("Conve: ", self.converters)
+
+
+# Module-level sample calls: each one builds a Matcher, which prints the regex
+# and the converters generated for the pattern. They run whenever this module
+# is executed or imported.
+Matcher("<temp : str><temp:float><temp:cc>*Test")
+Matcher("<temp:float[something]> °C wheather <planet>")
+Matcher("<:url><:url>")
+
+# txt = """
+#     \{test}
+#     {test:test[123]}
+#     <temp:float> °C
+#     <  year :   int[max=4]>-<month: int[len=4]>-<day:int[max=2]>
+#     <:float>*<:float><:float[ len = 2, case_sensitive]>
+#     <:float>\*<name>*\<str>
+#     <planet><test:end>
+#     """
+
+# for x in DEFAULT_ENV.parse(txt):
+#     print(x)
diff --git a/test_simplematch.py b/test_simplematch.py
@@ -78,7 +78,7 @@ def test_simple_matching():
     # should return None object if no match
     assert sm.match("{folder}/{filename}?{params}", "hello.js?p=1") is None
 
-    # should match strings with . (dot) and ? (question mart) sights
+    # should match strings with . (dot) and ? (question mark) signs
     assert sm.match("{folder}/{filename}?{params}", "home/hello.js?p=1") == dict(
         folder="home", filename="hello.js", params="p=1"
     )
@@ -240,6 +240,7 @@ def test_type_ccard(inp, result):
         ("https://xkcd.com/2293/", True),
         ("https://this-shouldn't.match@example.com", False),
         ("http://www.example.com/", True),
+        ("http:/ww.example.com/", False),
     ),
 )
 def test_type_url(inp, is_url):