Replace linkify-it-py dependency with a GFM autolink plugin for markdown-it (#48)
hukkin · Dec 12, 2024 · 16f26ae · 16f26ae
1 parent ec9e66f
commit 16f26ae
Show file tree

Hide file tree

Showing 9 changed files with 600 additions and 25 deletions.
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -38,7 +38,7 @@ jobs:
       run: |
         pip install pre-commit mypy==1.11.2
         pre-commit run --all-files
-        mypy .
+        mypy src/ tests/
 
   pypi-publish:
     # Only publish if all other jobs succeed

diff --git a/pyproject.toml b/pyproject.toml
@@ -15,7 +15,7 @@ license = { file = "LICENSE" }
 requires-python = ">=3.9"
 dependencies = [
     'mdformat >=0.7.5,<0.8.0',
-    'markdown-it-py[linkify]',  # Let `mdformat` choose version boundaries for `markdown-it-py`
+    'markdown-it-py',  # Let `mdformat` choose version boundaries for `markdown-it-py`
     'mdit-py-plugins >=0.2.0',
     'mdformat-tables >=0.4.0',
 ]
@@ -29,7 +29,7 @@ keywords = ["mdformat", "markdown", "formatter", "gfm"]
 "Homepage" = "https://github.com/hukkin/mdformat-gfm"
 
 [project.entry-points."mdformat.parser_extension"]
-"gfm" = "mdformat_gfm.plugin"
+"gfm" = "mdformat_gfm._mdformat_plugin"
 
 
 [tool.tox]
@@ -56,7 +56,7 @@ deps = [
     "-r tests/requirements.txt",
     "mypy ==1.11.2",
 ]
-commands = [["mypy", { replace = "posargs", default = ["."], extend = true }]]
+commands = [["mypy", { replace = "posargs", default = ["src/", "tests/"], extend = true }]]
 
 
 [tool.isort]

diff --git a/src/mdformat_gfm/_gfm.py b/src/mdformat_gfm/_gfm.py
@@ -0,0 +1,6 @@
+# The characters GFM treats as whitespace: space, tab, newline, line
+# tabulation, form feed and carriage return. See
+# https://github.github.com/gfm/#whitespace-character
+# (spec version 0.29-gfm (2019-04-06)).
+WHITESPACE = frozenset({" ", "\t", "\n", "\v", "\f", "\r"})
+
+# Characters that may directly precede a GFM autolink: any whitespace
+# character or one of the delimiters "*", "_", "~" and "(".
+BEFORE_AUTOLINK_CHARS = WHITESPACE.union("*_~(")
diff --git a/src/mdformat_gfm/plugin.py → src/mdformat_gfm/_mdformat_plugin.py b/src/mdformat_gfm/plugin.py → src/mdformat_gfm/_mdformat_plugin.py
@@ -5,14 +5,12 @@
 from mdformat.renderer import DEFAULT_RENDERERS, RenderContext, RenderTreeNode
 from mdit_py_plugins.tasklists import tasklists_plugin
 
-# A regex that matches a URL scheme and a following colon, as is valid in CommonMark
-RE_COMMONMARK_URL_SCHEME = re.compile("[A-Za-z][A-Za-z0-9+.-]{1,31}:")
+from mdformat_gfm._mdit_gfm_autolink_plugin import gfm_autolink_plugin
 
 
 def update_mdit(mdit: MarkdownIt) -> None:
-    # Enable linkify-it-py (for GFM autolink extension)
-    mdit.options["linkify"] = True
-    mdit.enable("linkify")
+    # Enable GFM autolink extension
+    mdit.use(gfm_autolink_plugin)
 
     # Enable mdformat-tables plugin
     tables_plugin = mdformat.plugins.PARSER_EXTENSIONS["tables"]
@@ -111,20 +109,8 @@ def _postprocess_inline(text: str, node: RenderTreeNode, context: RenderContext)
     return text
 
 
-def _link_renderer(node: RenderTreeNode, context: RenderContext) -> str:
-    """Extend the default link renderer to handle linkify links."""
-    if node.markup == "linkify":
-        autolink_url = node.attrs["href"]
-        assert isinstance(autolink_url, str)
-        startswith_scheme = RE_COMMONMARK_URL_SCHEME.match(autolink_url)
-        if startswith_scheme and not node.children[0].content.startswith(
-            startswith_scheme.group()
-        ):
-            autolink_url = autolink_url.split(":", maxsplit=1)[1]
-            if autolink_url.startswith("//"):
-                autolink_url = autolink_url[2:]
-        return autolink_url
-    return _render_with_default_renderer(node, context)
+def _gfm_autolink_renderer(node: RenderTreeNode, context: RenderContext) -> str:
+    """Render a GFM autolink node.
+
+    Returns the value stored under the "source_text" key of the node's
+    meta. `context` is not used.
+    """
+    source_text = node.meta["source_text"]
+    return source_text
 
 
 def _escape_text(text: str, node: RenderTreeNode, context: RenderContext) -> str:
@@ -147,7 +133,7 @@ def _escape_paragraph(text: str, node: RenderTreeNode, context: RenderContext) -
 RENDERERS = {
     "s": _strikethrough_renderer,
     "list_item": _list_item_renderer,
-    "link": _link_renderer,
+    "gfm_autolink": _gfm_autolink_renderer,
 }
 POSTPROCESSORS = {
     "text": _escape_text,

diff --git a/src/mdformat_gfm/_mdit_gfm_autolink_plugin.py b/src/mdformat_gfm/_mdit_gfm_autolink_plugin.py
@@ -0,0 +1,216 @@
+import re
+
+from markdown_it import MarkdownIt
+from markdown_it.rules_inline import StateInline
+
+from mdformat_gfm import _gfm
+from mdformat_gfm._text_inline_rule import text_rule
+
+
+def gfm_autolink_plugin(md: MarkdownIt) -> None:
+    """Register GFM autolink parsing on a markdown-it parser.
+
+    Adds the "gfm_autolink" inline rule ahead of the "linkify" rule and
+    swaps the stock "text" inline rule for an autolink-aware variant.
+    """
+    inline_ruler = md.inline.ruler
+    inline_ruler.before("linkify", "gfm_autolink", gfm_autolink)
+
+    # The stock "text" rule would skip over the first characters of a GFM
+    # autolink. Disabling the rule works, but is disastrous for
+    # performance, so it is replaced with a variant that also yields at
+    # every location where a GFM autolink could begin.
+    inline_ruler.at("text", text_rule)
+
+
+# Matches the email part of an autolink: a local part, "@" and a domain
+# with at least two dot-separated segments. A string that matches this
+# must still be invalidated if it ends with "_" or "-".
+RE_GFM_EMAIL = re.compile(r"[a-zA-Z0-9._+-]+@[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+")
+# Matches an autolink domain: at least two dot-separated segments. A string
+# that matches this must still be invalidated if its last two segments
+# contain "_".
+RE_GFM_AUTOLINK_DOMAIN = re.compile(r"[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+")
+
+# Matches an entity reference ("&", one or more ASCII alphanumerics, ";")
+# located at the very end of the searched string.
+RE_ENDS_IN_ENTITY_REF = re.compile(r"&[a-zA-Z0-9]+;\Z")
+
+# ASCII letters (both cases) and digits.
+ASCII_ALPHANUMERICS = frozenset(
+    "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789"
+)
+
+
+def gfm_autolink(state: StateInline, silent: bool) -> bool:  # noqa: C901
+    """Markdown-it-py rule to parse GFM autolinks.
+
+    This parses autolinks as specified here:
+    https://github.github.com/gfm/#autolinks-extension-
+
+    Four forms are tried at `state.pos`, in this order: "www." links,
+    "http://" / "https://" links, "mailto:" / "xmpp:" links and bare
+    email addresses. On success `state.pos` is moved past the autolink
+    and, unless `silent` is set, tokens are pushed to `state`.
+
+    Args:
+        state: Parse state object.
+        silent: Disables token generation.
+    Returns:
+        bool: True if GFM autolink found.
+    """
+    pos = state.pos
+    src = state.src
+
+    # Autolink can only be at the beginning of a line, after whitespace,
+    # or any of the delimiting characters *, _, ~, and (.
+    if pos:
+        preceding_char = src[pos - 1]
+        if preceding_char not in _gfm.BEFORE_AUTOLINK_CHARS:
+            return False
+
+    if src.startswith("www.", pos):
+        pos += 4
+        try:
+            pos, domain, resource = read_domain_and_resource(src, pos)
+        except NotFound:
+            return False
+
+        url = f"www.{domain}{resource}"
+        # The source text has no scheme, so the link target gets "http://".
+        full_url = "http://" + url
+    elif src.startswith(("http://", "https://"), pos):
+        # Both prefixes share "http"; the fifth character tells them apart.
+        scheme = "https://" if src[pos + 4] == "s" else "http://"
+        pos += len(scheme)
+
+        try:
+            pos, domain, resource = read_domain_and_resource(src, pos)
+        except NotFound:
+            return False
+
+        url = f"{scheme}{domain}{resource}"
+        full_url = url
+    elif src.startswith(("mailto:", "xmpp:"), pos):
+        scheme = "xmpp:" if src[pos] == "x" else "mailto:"
+        pos += len(scheme)
+
+        try:
+            pos, email = read_email(src, pos)
+        except NotFound:
+            return False
+
+        if scheme == "xmpp:" and src[pos : pos + 1] == "/":
+            pos += 1
+            resource_start_pos = pos
+            # Build the allowed character set once, not on every iteration.
+            xmpp_resource_chars = ASCII_ALPHANUMERICS | {".", "@"}
+            while pos < len(src) and src[pos] in xmpp_resource_chars:
+                pos += 1
+            resource = src[resource_start_pos:pos]
+            # A single trailing dot is not part of the resource.
+            if resource.endswith("."):
+                pos -= 1
+                resource = resource[:-1]
+            # A "/" that is not followed by a resource invalidates the
+            # whole autolink.
+            if not resource:
+                return False
+        else:
+            resource = ""
+
+        source_autolink = scheme + email
+        if resource:
+            source_autolink += "/" + resource
+
+        url = source_autolink
+        full_url = source_autolink
+    else:
+        try:
+            pos, email = read_email(src, pos)
+        except NotFound:
+            return False
+
+        url = email
+        # A bare email address links to a "mailto:" target.
+        full_url = "mailto:" + email
+
+    normalized_full_url = state.md.normalizeLink(full_url)
+    if not state.md.validateLink(normalized_full_url):
+        return False
+
+    push_tokens(state, normalized_full_url, url, silent)
+    state.pos = pos
+    return True
+
+
+def push_tokens(
+    state: StateInline, full_url: str, source_url: str, silent: bool
+) -> None:
+    """Push the open/text/close token triple for one GFM autolink.
+
+    Nothing is pushed when `silent` is true. The opening token carries
+    `full_url` as its "href" attribute and `source_url` as the
+    "source_text" entry of its meta. The text token holds the normalized
+    link text of `source_url`.
+    """
+    if not silent:
+        open_token = state.push("gfm_autolink_open", "a", 1)
+        open_token.attrs = {"href": full_url}
+        open_token.meta = {"source_text": source_url}
+
+        text_token = state.push("text", "", 0)
+        text_token.content = state.md.normalizeLinkText(source_url)
+
+        state.push("gfm_autolink_close", "a", -1)
+
+
+def trim_resource(untrimmed: str) -> tuple[str, int]:
+    """Trim illegal trailing chars from autolink resource.
+
+    Trim trailing punctuation, parentheses and entity refs as per GFM
+    spec. Also trim backslashes. The spec does not mention backslash,
+    but I think it should. This is referred to as "extended autolink
+    path validation" in the GFM spec.
+
+    A trailing ")" is only trimmed while the text up to and including it
+    has more closing than opening parentheses.
+
+    Args:
+        untrimmed: The resource as read from the source.
+    Returns:
+        A tuple with the trimmed resource and the amount of characters
+        removed.
+    """
+    # Parenthesis totals for `untrimmed[: i + 1]`, kept up to date while
+    # scanning backwards so that no prefix has to be recounted. Only ")"
+    # ever needs to be subtracted: "(" stops the scan, and entity refs
+    # cannot contain parentheses.
+    open_count = untrimmed.count("(")
+    close_count = untrimmed.count(")")
+
+    i = len(untrimmed) - 1
+    while i >= 0:
+        c = untrimmed[i]
+        if c == ";":
+            ending_entity_match = RE_ENDS_IN_ENTITY_REF.search(untrimmed, endpos=i + 1)
+            if not ending_entity_match:
+                break
+            # Jump to the "&"; the decrement below steps past it.
+            i = ending_entity_match.start()
+        elif c == ")":
+            if open_count >= close_count:
+                break
+            close_count -= 1
+        elif c in {"?", "!", ".", ",", ":", "*", "_", "~"}:
+            pass
+        elif c == "\\":  # not part of the spec, but should be
+            pass
+        else:
+            break
+        i -= 1
+
+    trimmed = untrimmed[: i + 1]
+    trim_count = len(untrimmed) - len(trimmed)
+    return trimmed, trim_count
+
+
+class NotFound(Exception):
+    """Raised when a reader function does not find what it is looking for."""
+
+
+def read_domain_and_resource(src: str, pos: int) -> tuple[int, str, str]:
+    """Read an autolink domain and the resource that follows it.
+
+    Args:
+        src: The text being parsed.
+        pos: Index in `src` where the domain is expected to start.
+    Returns:
+        A tuple (pos, domain, resource), where `pos` is the index right
+        after the trimmed resource. `resource` may be an empty string.
+    Raises:
+        NotFound: If no valid domain starts at `pos`.
+    """
+    domain_match = RE_GFM_AUTOLINK_DOMAIN.match(src, pos)
+    if not domain_match:
+        raise NotFound
+    domain = domain_match.group()
+    pos = domain_match.end()
+    # Underscores are not allowed in the last two segments of the domain.
+    # The regex guarantees that there are at least two segments.
+    segments = domain.rsplit(".", 2)
+    if "_" in segments[-2] or "_" in segments[-1]:
+        raise NotFound
+
+    # Build the terminator set and the length once, rather than once per
+    # scanned character.
+    resource_terminators = _gfm.WHITESPACE | {"<"}
+    src_len = len(src)
+    resource_start_pos = pos
+    while pos < src_len and src[pos] not in resource_terminators:
+        pos += 1
+    resource = src[resource_start_pos:pos]
+
+    # Give back the characters that trimming removed from the resource.
+    resource, trim_count = trim_resource(resource)
+    pos -= trim_count
+    return pos, domain, resource
+
+
+def read_email(src: str, pos: int) -> tuple[int, str]:
+    """Read the email part of an autolink starting at `pos`.
+
+    Return a tuple (pos, email), where `pos` is the index right after the
+    email. Raise NotFound if there is no valid email at `pos`.
+    """
+    match = RE_GFM_EMAIL.match(src, pos)
+    if match is None:
+        raise NotFound
+
+    email = match.group()
+    if email.endswith(("-", "_")):
+        raise NotFound
+    end = match.end()
+
+    # Not strictly GFM spec, but an attempt to cover up its flaws: since
+    # a trailing hyphen or underscore invalidates an autolink, an escaped
+    # one directly after the email should invalidate it as well.
+    if src.startswith(("\\-", "\\_"), end):
+        raise NotFound
+
+    return end, email
diff --git a/src/mdformat_gfm/_text_inline_rule.py b/src/mdformat_gfm/_text_inline_rule.py
@@ -0,0 +1,87 @@
+"""A replacement for the "text" inline rule in markdown-it.
+
+The default "text" rule will skip until the next character in
+`_TerminatorChars` is found. This extends the set of termination points
+to those that can potentially be the beginning of a GFM autolink. The
+GFM autolink plugin also works with "text" inline rule disabled, but
+this should (at least partially) bring back the performance boost that
+"text" inline rule provides.
+"""
+
+import re
+
+from markdown_it.rules_inline import StateInline
+
+from mdformat_gfm import _gfm
+
+# The default set of terminator characters, i.e. the characters at which
+# the default "text" rule stops skipping.
+_TerminatorChars = {
+    "\n",
+    "!",
+    "#",
+    "$",
+    "%",
+    "&",
+    "*",
+    "+",
+    "-",
+    ":",
+    "<",
+    "=",
+    ">",
+    "@",
+    "[",
+    "\\",
+    "]",
+    "^",
+    "_",
+    "`",
+    "{",
+    "}",
+    "~",
+}
+
+# Regex character class that matches any one default terminator character.
+_default_terminator = "[" + re.escape("".join(_TerminatorChars)) + "]"
+# Regex group that matches text which can potentially begin a GFM autolink:
+# "www.", "http", "mailto:", "xmpp:" or an email local part followed by "@".
+_gfm_autolink_terminator = (
+    r"(?:" r"www\." "|" "http" "|" "mailto:" "|" "xmpp:" "|" r"[a-zA-Z0-9._+-]+@" r")"
+)
+# Regex character class that matches any one character that is allowed to
+# directly precede a GFM autolink.
+_before_autolink = "[" + re.escape("".join(_gfm.BEFORE_AUTOLINK_CHARS)) + "]"
+
+# Used at position zero, where there is no preceding character to look at:
+# matches a default terminator character or a potential autolink start.
+_RE_TERMINATOR_FIRST_CHAR = re.compile(
+    _default_terminator + "|" + _gfm_autolink_terminator
+)
+# Used everywhere else. Every alternative consumes exactly one character
+# before the termination point, so the termination point is always one past
+# the start of the match.
+_RE_TERMINATOR_NON_FIRST_CHAR = re.compile(
+    r"(?s:.)"  # match any character (also newline)
+    + _default_terminator
+    + "|"
+    + _before_autolink
+    + _gfm_autolink_terminator
+)
+
+
+def text_rule(state: StateInline, silent: bool) -> bool:
+    """Consume plain text up to the next possible inline construct.
+
+    Skips forward from `state.pos` to the next default terminator
+    character or potential GFM autolink start, or to `state.posMax` if
+    there is none. The skipped text is appended to `state.pending` unless
+    `silent` is set. Return False if no text could be skipped.
+    """
+    start = state.pos
+    src = state.src
+
+    # At position zero there is no preceding character to inspect, so the
+    # very first character is checked with a dedicated regex.
+    search_from = start
+    if start == 0:
+        if _RE_TERMINATOR_FIRST_CHAR.match(src):
+            return False
+        search_from = 1
+
+    # `search_from` is now at least one, so the search can use a regex
+    # that also looks at the character preceding the termination point.
+    found = _RE_TERMINATOR_NON_FIRST_CHAR.search(src, search_from - 1)
+    end = state.posMax if found is None else found.start() + 1
+
+    if end == start:
+        return False
+
+    if not silent:
+        state.pending += src[start:end]
+
+    state.pos = end
+
+    return True