Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ classifiers = [
[tool.poetry.dependencies]
python = "^3.5"

[tool.isort]
profile = "black"
skip_gitignore = true
line_length = 88

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
4 changes: 2 additions & 2 deletions simplematch.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@


def register_type(name, regex, converter=str):
""" register a type to be available for the {value:type} matching syntax """
"""register a type to be available for the {value:type} matching syntax"""
cleaned = TYPE_CLEANUP_REGEX.sub("(?:", regex)
types[name] = Type(regex=cleaned, converter=converter)

Expand Down Expand Up @@ -133,7 +133,7 @@ def _create_regex(self, pattern):

@staticmethod
def _grouplist(match):
""" extract unnamed match groups """
"""extract unnamed match groups"""
# https://stackoverflow.com/a/53385788/300783
named = match.groupdict()
ignored_groups = set()
Expand Down
Empty file added simplematch/__init__.py
Empty file.
136 changes: 136 additions & 0 deletions simplematch/converters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import decimal
from ipaddress import IPv4Address


class QuantifierMixin:
    """Base class for converters that can receive bracketed block arguments.

    The pattern syntax allows ``<name:converter[args]>``; ``args`` is the raw
    text captured between the square brackets, or ``None`` when the block has
    no argument section.
    """

    def __init__(self, args=None):
        # ``args`` defaults to ``None`` so that converters can be instantiated
        # without arguments, which is how ``Environment._replacer`` creates
        # them. The value is accepted but not interpreted yet.
        pass


class Str(QuantifierMixin):
    """Default converter: matches any run of characters and keeps it as text."""

    # Greedy "anything" (zero or more characters, newlines excluded by default).
    regex = r".*"

    @staticmethod
    def to_python(value: str) -> str:
        """Return the matched text unchanged."""
        return value


class Int(QuantifierMixin):
    """Converter for optionally signed base-10 integers such as ``42`` or ``-7``."""

    # One or more digits with an optional leading sign. The trailing ``+`` is
    # required: without it only a single digit would be consumed.
    regex = r"[+-]?[0-9]+"

    @staticmethod
    def to_python(value: str) -> int:
        """Convert the matched text to an ``int``."""
        return int(value)


class Float:
    """Converter for optionally signed decimal numbers (``42``, ``-1.5``, ``.5``)."""

    # Optional sign, optional "<digits>." prefix, then at least one digit.
    # Exponent notation and a trailing bare dot ("1.") are not covered.
    regex = r"[+-]?([0-9]*[.])?[0-9]+"

    @staticmethod
    def to_python(value: str) -> float:
        """Convert the matched text to a ``float``."""
        return float(value)


class Decimal(Float):
    """Same pattern as ``Float`` but converts to ``decimal.Decimal``."""

    @staticmethod
    def to_python(value: str) -> decimal.Decimal:
        """Convert the matched text to a ``decimal.Decimal``."""
        return decimal.Decimal(value)


class FourDigitYear(Int):
    """Converter for a year written with exactly four digits (no sign)."""

    # Exactly four decimal digits.
    regex = "[0-9]{4}"

    @staticmethod
    def to_python(value: str) -> int:
        """Convert the matched four-digit text to an ``int``."""
        return int(value)


class Letters(Str):
    """Converter for one or more ASCII letters; the value stays a string."""

    # ASCII letters only: no digits, whitespace or non-ASCII characters.
    regex = r"[a-zA-Z]+"


class RomanNumeral(Int):
    """Converter for upper-case Roman numerals up to ``MMMCMXCIX`` (3999).

    The inherited ``Int.to_python`` would call ``int()`` on text such as
    ``"XIV"`` and raise ``ValueError``, so the conversion is overridden here.
    """

    # Thousands, hundreds, tens and units, each in standard subtractive form.
    # NOTE: every part is optional, so the pattern also matches the empty
    # string; ``to_python`` returns 0 in that case.
    regex = r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"

    # Value of each Roman symbol accepted by ``regex``.
    _SYMBOL_VALUES = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}

    @staticmethod
    def to_python(value: str) -> int:
        """Convert a Roman numeral accepted by ``regex`` to an ``int``.

        Raises:
            KeyError: if ``value`` contains a character that is not an
                upper-case Roman symbol.
        """
        symbol_values = RomanNumeral._SYMBOL_VALUES
        total = 0
        right_neighbour = 0
        # Walk right-to-left: a symbol smaller than its right neighbour is
        # subtractive (the "I" in "IV"), otherwise it is added.
        for symbol in reversed(value):
            current = symbol_values[symbol]
            if current < right_neighbour:
                total -= current
            else:
                total += current
            right_neighbour = current
        return total


class Bitcoin(Str):
    """Converter for Bitcoin-address-shaped strings; the value stays a string."""

    # Prefix "bc1", "1" or "3" followed by 25-39 characters from the set
    # [a-zA-HJ-NP-Z0-9]. This is a shape check only; no checksum is verified.
    regex = r"(bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}"


class Email(Str):
    """Converter for simple e-mail addresses; the value stays a string."""

    # local-part "@" domain-label "." rest-of-domain. The local part allows
    # letters, digits and "_.+-"; the rest of the domain allows letters,
    # digits, "-" and ".".
    regex = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"


class Url(Str):
    """Converter for ``http://`` / ``https://`` URLs; the value stays a string."""

    # Scheme, optional "www.", a host part of 1-256 characters, a dot and a
    # 1-6 character suffix ending at a word boundary, then an optional run of
    # path/query/fragment characters.
    regex = (
        r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b"
        r"([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)"
    )


class IpV4:
    """Converter for dotted-quad IPv4 addresses such as ``192.168.0.1``."""

    # Four octets in the range 0-255 separated by dots.
    regex = (
        r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
        r"(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"
    )

    # A static method, like ``to_python`` on the other converters, so it can
    # be called on the class as well as on an instance.
    @staticmethod
    def to_python(value: str) -> IPv4Address:
        """Convert the matched text to an ``ipaddress.IPv4Address``.

        Raises:
            ipaddress.AddressValueError: if ``value`` is not a valid IPv4
                address.
        """
        return IPv4Address(value)


class IpV6:
    """Pattern holder for IPv6 address text; defines no ``to_python``."""

    # One large alternation over textual IPv6 forms: eight full groups,
    # "::"-compressed variants, an "fe80:" form with a "%zone" suffix, and
    # forms ending in a dotted-quad IPv4 part.
    # NOTE(review): the alternatives have not been individually verified here.
    regex = (
        r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA"
        r"-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){"
        r"1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3"
        r"}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0"
        r"-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:"
        r"(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5"
        r"]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0"
        r"-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,"
        r"3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"
    )


class Port:
    """Pattern holder for TCP/UDP port numbers (0-65535); defines no ``to_python``."""

    # Alternatives run from the most specific five-digit range down to the
    # generic 1-4 digit case, so the longest valid number is preferred.
    # Every alternative consumes at least one digit: an alternative that can
    # match the empty string would let ``re.match`` succeed with "" on input
    # such as "8080" before the generic case is tried.
    # Leading zeros are accepted by the final alternative (e.g. "0080").
    regex = (
        r"((6553[0-5])|(655[0-2][0-9])|(65[0-4][0-9]{2})|(6[0-4][0-9]{3})|"
        r"([1-5][0-9]{4})|([0-9]{1,4}))"
    )


class MacAddress:
    """Pattern holder for colon-separated MAC addresses; defines no ``to_python``."""

    # Six two-digit hexadecimal groups joined by ":" (e.g. "00:1A:2b:3C:4d:5E").
    # Hyphen- or dot-separated notations are not matched.
    regex = r"[a-fA-F0-9]{2}(:[a-fA-F0-9]{2}){5}"


class SocialSecurityNumber(Str):
    """Converter for ``AAA-GG-SSSS`` formatted numbers; the value stays a string."""

    # Three digits, two digits, four digits, separated by hyphens. The
    # lookaheads reject a first group of "000" or "666", a second group of
    # "00" and a third group of "0000"; the first digit must be 0-8.
    regex = r"(?!0{3})(?!6{3})[0-8]\d{2}-(?!0{2})\d{2}-(?!0{4})\d{4}"


class CreditCard:
    """Pattern holder for credit-card-number shapes; defines no ``to_python``.

    The alternatives, in order, are the prefix/length shapes usually
    attributed to Visa, Mastercard, American Express, Diners Club, Discover
    and JCB. Only the digit pattern is checked; no Luhn checksum is verified.
    """

    # The alternatives carry no ``^``/``$`` anchors and the whole alternation
    # is wrapped in one non-capturing group, so the pattern can be embedded
    # inside a larger regular expression (which is how converter regexes are
    # used) without the ``|`` leaking into the surrounding pattern.
    regex = (
        r"(?:(4[0-9]{12}(?:[0-9]{3})?)|((?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6]["
        r"0-9]{2}|27[01][0-9]|2720)[0-9]{12})|(3[47][0-9]{13})|(3(?:0[0-5]|[68][0-9])"
        r"[0-9]{11})|(6(?:011|5[0-9]{2})[0-9]{12})|((?:2131|1800|35\d{3})\d{11}))"
    )


class LatLon:
    """Pattern holder for a ``lat,lon`` coordinate pair; defines no ``to_python``."""

    # Two optionally signed decimal numbers separated by a comma and optional
    # whitespace. Numeric ranges (such as +/-90 or +/-180) are not validated.
    regex = r"((\-?|\+?)?\d+(\.\d+)?),\s*((\-?|\+?)?\d+(\.\d+)?)"


class SemanticVersion:
    """Pattern holder for ``MAJOR.MINOR.PATCH`` versions; defines no ``to_python``."""

    # Three dot-separated numeric parts without leading zeros, an optional
    # "-pre.release" section and an optional "+build.metadata" section.
    # The numeric parts and both optional sections are capturing groups.
    regex = (
        r"(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)"
        r"(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)"
        r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?"
        r"(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?"
    )


class JiraIssueTicket:
    """Pattern holder for issue keys such as ``PROJ-123``; defines no ``to_python``."""

    # Two or more upper-case ASCII letters, a hyphen, then one or more digits.
    regex = r"[A-Z]{2,}-\d+"


class Hashtag:
    """Pattern holder for ``#hashtag`` tokens; defines no ``to_python``."""

    # "#" followed by zero or more characters that are neither a space nor one
    # of the listed punctuation characters. Because of the "*" quantifier a
    # bare "#" also matches.
    regex = r"#[^ !@#$%^&*(),.?\":{}|<>]*"
Empty file added simplematch/py.typed
Empty file.
153 changes: 153 additions & 0 deletions simplematch/simplematch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""
simplematch
"""
import re
from typing import NamedTuple, Optional
from collections import defaultdict

from . import converters as cv


class Block(NamedTuple):
    """The three parts of a ``<name:converter[args]>`` pattern block.

    Every field is ``None`` when the corresponding part is absent from the
    block.
    """

    # Identifier in front of the colon.
    name: Optional[str]
    # Converter name following the colon.
    converter: Optional[str]
    # Raw text between the square brackets.
    args: Optional[str]


def block_parser_regex(block_start_string: str = "<", block_end_string: str = ">"):
    """
    Assembles a regular expression which matches wildcards (`*`) and blocks
    in the form of

    `<name:converter[args]>`

    Block delimiters (`<` and `>`) can be changed via the `block_start_string` and
    `block_end_string` arguments. The delimiters are taken literally: they are
    passed through `re.escape`, so characters with a special meaning in regular
    expressions (`(`, `[`, `|`, ...) are safe to use.

    Matches have three captures: (`name`, `converter`, `args`). A capture is
    `None` when that part is missing; for a wildcard all three are `None`.
    A wildcard or block start preceded by a backslash is not matched.

    Returns the compiled pattern (compiled with `re.VERBOSE`).
    """
    # Escape the delimiters once: they are used both inside a character class
    # and as plain tokens in a VERBOSE pattern, where unescaped
    # metacharacters, whitespace or `#` would change the meaning.
    start = re.escape(block_start_string)
    end = re.escape(block_end_string)

    # Characters allowed inside name / converter / args: everything except the
    # structural characters `:`, `[`, `]` and the delimiter characters.
    # https://regex101.com/r/xS2B04/3
    safe_chars = r"[^:\[\]%s%s]" % (start, end)
    regex = re.compile(
        r"""
        (?<!\\)\*                   # match either an unescaped wildcard `*`
        |                           # or
        (?:                         # a converter definition
            (?<!\\)                 # allow escaping the block start string
            {start}                 # start block
            ({safe}+?)?             # the optional identifier name.
            (?:                     # make the converter part optional
                :                   # converter definition starts with `:`
                ({safe}+?)          # converter name
                (?:\[({safe}+?)\])? # converter arguments
            )?                      # end of converter part
            {end}                   # end of block
        )
        """.format(
            start=start,
            end=end,
            safe=safe_chars,
        ),
        re.VERBOSE,
    )
    return regex


class Environment:
    """Translates simplematch patterns into regular-expression source.

    An environment owns the block syntax (start/end delimiters), the key under
    which converters of unnamed blocks are collected, and the mapping from
    converter names to converter classes.
    """

    # Converter name (as written after the `:` in a block) -> converter class.
    converters = {
        "str": cv.Str,
        "int": cv.Int,
        "float": cv.Float,
        "decimal": cv.Decimal,
        "yyyy": cv.FourDigitYear,
        "letters": cv.Letters,
        "roman": cv.RomanNumeral,
        "bitcoin": cv.Bitcoin,
        "email": cv.Email,
        "url": cv.Url,
        "ipv4": cv.IpV4,
        "ipv6": cv.IpV6,
        "port": cv.Port,
        "mac": cv.MacAddress,
        "ssn": cv.SocialSecurityNumber,
        "cc": cv.CreditCard,
        "latlon": cv.LatLon,
        "semver": cv.SemanticVersion,
        "jira": cv.JiraIssueTicket,
        "hashtag": cv.Hashtag,
    }

    def __init__(
        self,
        block_start_string: str,
        block_end_string: str,
        unnamed_key: str,
    ):
        """Compile the block parser for the given delimiters.

        `unnamed_key` is the dictionary key used for blocks without a name.
        """
        # Scratch storage filled by `_replacer` while a pattern is parsed.
        self._tmp_converters = defaultdict(list)
        self.unnamed_key = unnamed_key
        self.block_parser_regex = block_parser_regex(
            block_start_string=block_start_string,
            block_end_string=block_end_string,
        )

    def _replacer(self, match: re.Match) -> str:
        """
        Substitution callback for a single wildcard or block match.

        A wildcard is replaced by `.*`. For a block, the converter is looked
        up by name (falling back to `cv.Str`), instantiated, recorded under
        the block name (or `unnamed_key`), and its regex is returned.
        """
        # Surrounding whitespace inside the block is not significant.
        captures = [
            group if group is None else group.strip() for group in match.groups()
        ]
        name, converter_name, _converter_args = captures

        # A wildcard (`*`) match leaves all three captures empty.
        if all(capture is None for capture in captures):
            return r".*"

        converter_class = self.converters.get(converter_name, cv.Str)
        instance = converter_class()
        storage_key = name or self.unnamed_key
        self._tmp_converters[storage_key].append(instance)
        return instance.regex

    def parse_pattern(self, pattern: str):
        """Return `(regex_source, converters)` for `pattern`.

        `converters` is a plain dict mapping each block name (or
        `unnamed_key`) to the list of converter instances created for it.
        """
        collected = self._tmp_converters
        # Drop whatever a previous call left behind before substituting.
        collected.clear()
        regex_source = self.block_parser_regex.sub(self._replacer, pattern)
        return regex_source, dict(collected)


# Shared default environment: `<` / `>` delimit blocks, and converters of
# blocks without a name are collected under the "unnamed" key.
DEFAULT_ENV = Environment(
    unnamed_key="unnamed",
    block_end_string=">",
    block_start_string="<",
)


class Matcher:
    """Holds a simplematch pattern together with its parsed form.

    On construction the pattern is parsed by the environment; the resulting
    regex source and converter mapping are stored on the instance and printed
    to stdout. `case_sensitive` is stored but not otherwise used here.
    """

    def __init__(
        self,
        pattern: str = "*",
        case_sensitive: bool = True,
        environment=DEFAULT_ENV,
    ):
        self.pattern = pattern
        self.case_sensitive = case_sensitive
        self.environment = environment

        parsed = self.environment.parse_pattern(pattern)
        self.regex = parsed[0]
        self.converters = parsed[1]

        # Debug output of the parse result.
        print("Regex: ", self.regex)
        print("Conve: ", self.converters)


# Ad-hoc smoke checks: these run whenever the module is imported, and each
# Matcher construction prints the generated regex and converter mapping.
# They exercise whitespace inside a block, repeated names, a wildcard,
# converter arguments, a block without a converter, and unnamed blocks.
Matcher("<temp : str><temp:float><temp:cc>*Test")
Matcher("<temp:float[something]> °C wheather <planet>")
Matcher("<:url><:url>")

# txt = """
# \{test}
# {test:test[123]}
# <temp:float> °C
# < year : int[max=4]>-<month: int[len=4]>-<day:int[max=2]>
# <:float>*<:float><:float[ len = 2, case_sensitive]>
# <:float>\*<name>*\<str>
# <planet><test:end>
# """

# for x in DEFAULT_ENV.parse(txt):
# print(x)
3 changes: 2 additions & 1 deletion test_simplematch.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_simple_matching():
# should return None object if no match
assert sm.match("{folder}/{filename}?{params}", "hello.js?p=1") is None

# should match strings with . (dot) and ? (question mart) sights
# should match strings with . (dot) and ? (question mark) signs
assert sm.match("{folder}/{filename}?{params}", "home/hello.js?p=1") == dict(
folder="home", filename="hello.js", params="p=1"
)
Expand Down Expand Up @@ -240,6 +240,7 @@ def test_type_ccard(inp, result):
("https://xkcd.com/2293/", True),
("https://this-shouldn't.match@example.com", False),
("http://www.example.com/", True),
("http:/ww.example.com/", False),
),
)
def test_type_url(inp, is_url):
Expand Down