Skip to content

Commit

Permalink
Replace linkify-it-py dependency with a GFM autolink plugin for markd…
Browse files Browse the repository at this point in the history
…own-it (#48)
  • Loading branch information
hukkin authored Dec 12, 2024
1 parent ec9e66f commit 16f26ae
Show file tree
Hide file tree
Showing 9 changed files with 600 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
run: |
pip install pre-commit mypy==1.11.2
pre-commit run --all-files
mypy .
mypy src/ tests/
pypi-publish:
# Only publish if all other jobs succeed
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ license = { file = "LICENSE" }
requires-python = ">=3.9"
dependencies = [
'mdformat >=0.7.5,<0.8.0',
'markdown-it-py[linkify]', # Let `mdformat` choose version boundaries for `markdown-it-py`
'markdown-it-py', # Let `mdformat` choose version boundaries for `markdown-it-py`
'mdit-py-plugins >=0.2.0',
'mdformat-tables >=0.4.0',
]
Expand All @@ -29,7 +29,7 @@ keywords = ["mdformat", "markdown", "formatter", "gfm"]
"Homepage" = "https://github.com/hukkin/mdformat-gfm"

[project.entry-points."mdformat.parser_extension"]
"gfm" = "mdformat_gfm.plugin"
"gfm" = "mdformat_gfm._mdformat_plugin"


[tool.tox]
Expand All @@ -56,7 +56,7 @@ deps = [
"-r tests/requirements.txt",
"mypy ==1.11.2",
]
commands = [["mypy", { replace = "posargs", default = ["."], extend = true }]]
commands = [["mypy", { replace = "posargs", default = ["src/", "tests/"], extend = true }]]


[tool.isort]
Expand Down
6 changes: 6 additions & 0 deletions src/mdformat_gfm/_gfm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# The characters that GFM treats as whitespace. See
# https://github.github.com/gfm/#whitespace-character
# (spec version 0.29-gfm, 2019-04-06).
WHITESPACE = frozenset({" ", "\t", "\n", "\v", "\f", "\r"})

# Whitespace plus the delimiter characters "*", "_", "~" and "(".
BEFORE_AUTOLINK_CHARS = WHITESPACE.union("*_~(")
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
from mdformat.renderer import DEFAULT_RENDERERS, RenderContext, RenderTreeNode
from mdit_py_plugins.tasklists import tasklists_plugin

# A regex that matches a URL scheme and a following colon, as is valid in CommonMark
RE_COMMONMARK_URL_SCHEME = re.compile("[A-Za-z][A-Za-z0-9+.-]{1,31}:")
from mdformat_gfm._mdit_gfm_autolink_plugin import gfm_autolink_plugin


def update_mdit(mdit: MarkdownIt) -> None:
# Enable linkify-it-py (for GFM autolink extension)
mdit.options["linkify"] = True
mdit.enable("linkify")
# Enable GFM autolink extension
mdit.use(gfm_autolink_plugin)

# Enable mdformat-tables plugin
tables_plugin = mdformat.plugins.PARSER_EXTENSIONS["tables"]
Expand Down Expand Up @@ -111,20 +109,8 @@ def _postprocess_inline(text: str, node: RenderTreeNode, context: RenderContext)
return text


def _link_renderer(node: RenderTreeNode, context: RenderContext) -> str:
"""Extend the default link renderer to handle linkify links."""
if node.markup == "linkify":
autolink_url = node.attrs["href"]
assert isinstance(autolink_url, str)
startswith_scheme = RE_COMMONMARK_URL_SCHEME.match(autolink_url)
if startswith_scheme and not node.children[0].content.startswith(
startswith_scheme.group()
):
autolink_url = autolink_url.split(":", maxsplit=1)[1]
if autolink_url.startswith("//"):
autolink_url = autolink_url[2:]
return autolink_url
return _render_with_default_renderer(node, context)
def _gfm_autolink_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render a GFM autolink as the text stored in its "source_text" meta."""
    source_text = node.meta["source_text"]
    return source_text


def _escape_text(text: str, node: RenderTreeNode, context: RenderContext) -> str:
Expand All @@ -147,7 +133,7 @@ def _escape_paragraph(text: str, node: RenderTreeNode, context: RenderContext) -
RENDERERS = {
"s": _strikethrough_renderer,
"list_item": _list_item_renderer,
"link": _link_renderer,
"gfm_autolink": _gfm_autolink_renderer,
}
POSTPROCESSORS = {
"text": _escape_text,
Expand Down
216 changes: 216 additions & 0 deletions src/mdformat_gfm/_mdit_gfm_autolink_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
import re

from markdown_it import MarkdownIt
from markdown_it.rules_inline import StateInline

from mdformat_gfm import _gfm
from mdformat_gfm._text_inline_rule import text_rule


def gfm_autolink_plugin(md: MarkdownIt) -> None:
    """Register GFM autolink parsing on a markdown-it parser."""
    inline_ruler = md.inline.ruler

    # Insert the autolink rule ahead of the "linkify" rule.
    inline_ruler.before("linkify", "gfm_autolink", gfm_autolink)

    # The stock "text" inline rule consumes the leading characters of GFM
    # autolinks. Disabling it would work, but is disastrous for performance,
    # so it is swapped for a custom "text" rule that stops at every location
    # where a GFM autolink could potentially begin.
    inline_ruler.at("text", text_rule)


# Candidate email address. A match is only a candidate: the caller must
# still reject it when it ends with "_" or "-".
RE_GFM_EMAIL = re.compile(r"[a-zA-Z0-9._+-]+@[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+")

# Candidate autolink domain. A match is only a candidate: the caller must
# still reject it when either of the last two segments contains "_".
RE_GFM_AUTOLINK_DOMAIN = re.compile(r"[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+")

# An entity reference ("&", alphanumerics, ";") sitting at the very end
RE_ENDS_IN_ENTITY_REF = re.compile(r"&[a-zA-Z0-9]+;\Z")

ASCII_ALPHANUMERICS = frozenset(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
)


def gfm_autolink(state: StateInline, silent: bool) -> bool:  # noqa: C901
    """Markdown-it-py rule to parse GFM autolinks.

    This parses autolinks as specified here:
    https://github.github.com/gfm/#autolinks-extension-

    Four forms are tried in order at `state.pos`: a "www." link, an
    "http://" or "https://" link, a "mailto:" or "xmpp:" link, and finally
    a bare email address.

    Args:
        state: Parse state object.
        silent: Disables token generation.

    Returns:
        bool: True if GFM autolink found. In that case `state.pos` has been
        advanced past the autolink.
    """
    pos = state.pos
    src = state.src

    # Autolink can only be at the beginning of a line, after whitespace,
    # or any of the delimiting characters *, _, ~, and (.
    if pos:
        preceding_char = src[pos - 1]
        if preceding_char not in _gfm.BEFORE_AUTOLINK_CHARS:
            return False

    if src.startswith("www.", pos):
        pos += 4
        try:
            pos, domain, resource = read_domain_and_resource(src, pos)
        except NotFound:
            return False

        url = f"www.{domain}{resource}"
        # The source text has no scheme, so the link target gets "http://"
        full_url = "http://" + url
    elif src.startswith(("http://", "https://"), pos):
        # Both prefixes start with "http", so the fifth character decides
        scheme = "https://" if src[pos + 4] == "s" else "http://"
        pos += len(scheme)

        try:
            pos, domain, resource = read_domain_and_resource(src, pos)
        except NotFound:
            return False

        url = f"{scheme}{domain}{resource}"
        full_url = url
    elif src.startswith(("mailto:", "xmpp:"), pos):
        scheme = "xmpp:" if src[pos] == "x" else "mailto:"
        pos += len(scheme)

        try:
            pos, email = read_email(src, pos)
        except NotFound:
            return False

        if scheme == "xmpp:" and src[pos : pos + 1] == "/":
            pos += 1
            resource_start_pos = pos
            # Build the allowed-character set once, not on every iteration
            xmpp_resource_chars = ASCII_ALPHANUMERICS | {".", "@"}
            while pos < len(src) and src[pos] in xmpp_resource_chars:
                pos += 1
            resource = src[resource_start_pos:pos]
            # A single trailing "." is not part of the resource
            if resource.endswith("."):
                pos -= 1
                resource = resource[:-1]
            # A "/" with nothing usable after it invalidates the autolink
            if not resource:
                return False
        else:
            resource = ""

        source_autolink = scheme + email
        if resource:
            source_autolink += "/" + resource

        url = source_autolink
        full_url = source_autolink
    else:
        try:
            pos, email = read_email(src, pos)
        except NotFound:
            return False

        url = email
        # A bare email address links to a "mailto:" target
        full_url = "mailto:" + email

    normalized_full_url = state.md.normalizeLink(full_url)
    if not state.md.validateLink(normalized_full_url):
        return False

    push_tokens(state, normalized_full_url, url, silent)
    state.pos = pos
    return True


def push_tokens(
    state: StateInline, full_url: str, source_url: str, silent: bool
) -> None:
    """Push the open, text and close tokens of one GFM autolink.

    Nothing is pushed when `silent` is true. The opening token carries
    `full_url` as its "href" attribute and `source_url` as its
    "source_text" meta. The text token holds the normalized `source_url`.
    """
    if silent:
        return

    opening = state.push("gfm_autolink_open", "a", 1)
    opening.attrs = {"href": full_url}
    opening.meta = {"source_text": source_url}

    link_text = state.push("text", "", 0)
    link_text.content = state.md.normalizeLinkText(source_url)

    state.push("gfm_autolink_close", "a", -1)


def _entity_ref_start(text: str, semicolon_pos: int) -> int:
    """Return the start index of the entity reference ending at `semicolon_pos`.

    An entity reference is "&", one or more ASCII alphanumerics, then ";".
    `text[semicolon_pos]` is expected to be ";". Return -1 if
    `text[: semicolon_pos + 1]` does not end in an entity reference.
    """
    start = semicolon_pos - 1
    # Walk backwards over the entity name
    while start >= 0 and text[start].isascii() and text[start].isalnum():
        start -= 1
    has_name = start < semicolon_pos - 1
    if has_name and start >= 0 and text[start] == "&":
        return start
    return -1


def trim_resource(untrimmed: str) -> tuple[str, int]:
    """Trim illegal trailing chars from autolink resource.

    Trim trailing punctuation, parentheses and entity refs as per GFM
    spec. Also trim backslashes. The spec does not mention backslash,
    but I think it should. This is referred to as "extended autolink
    path validation" in the GFM spec.

    A trailing ")" is only trimmed while the remaining text holds more
    ")" than "(" characters.

    Return a tuple with the trimmed resource and the amount of
    characters removed.
    """
    # An opening parenthesis is never trimmed, so every remaining prefix
    # contains all of them. Counting once keeps the scan linear.
    open_parens = untrimmed.count("(")
    # Number of ")" in `untrimmed[: i + 1]`, updated as they are trimmed
    close_parens = untrimmed.count(")")

    i = len(untrimmed) - 1
    while i >= 0:
        c = untrimmed[i]
        if c == ";":
            entity_start = _entity_ref_start(untrimmed, i)
            if entity_start < 0:
                break
            # Drop the whole entity reference
            i = entity_start
        elif c == ")":
            if open_parens >= close_parens:
                break
            close_parens -= 1
        # Backslash is not part of the spec, but should be
        elif c not in {"?", "!", ".", ",", ":", "*", "_", "~", "\\"}:
            break
        i -= 1

    trimmed = untrimmed[: i + 1]
    trim_count = len(untrimmed) - len(trimmed)
    return trimmed, trim_count


class NotFound(Exception):
    """Signal that a function did not find what it was looking for."""


def read_domain_and_resource(src: str, pos: int) -> tuple[int, str, str]:
    """Read autolink domain and resource.

    The domain must start at `pos`. The resource is everything after it
    up to the next whitespace character or "<", minus any trailing
    characters removed by `trim_resource`.

    Raise NotFound if not found. Return a tuple (pos, domain, resource),
    where `pos` is the position right after the resource.
    """
    domain_match = RE_GFM_AUTOLINK_DOMAIN.match(src, pos)
    if not domain_match:
        raise NotFound
    domain = domain_match.group()
    pos = domain_match.end()
    # Underscores are not allowed in the last two segments of the domain
    segments = domain.rsplit(".", 2)
    if "_" in segments[-2] or "_" in segments[-1]:
        raise NotFound

    resource_start_pos = pos
    # Build the terminator set once, not on every iteration
    resource_terminators = _gfm.WHITESPACE | {"<"}
    src_len = len(src)
    while pos < src_len and src[pos] not in resource_terminators:
        pos += 1
    resource = src[resource_start_pos:pos]

    # Trimmed characters are not part of the autolink, so step back over them
    resource, trim_count = trim_resource(resource)
    pos -= trim_count
    return pos, domain, resource


def read_email(src: str, pos: int) -> tuple[int, str]:
    """Read an autolink email address starting at `pos`.

    Raise NotFound if there is no valid address. Otherwise return a tuple
    (pos, email), where `pos` is the position right after the address.
    """
    match = RE_GFM_EMAIL.match(src, pos)
    if match is None:
        raise NotFound

    email = match.group()
    # A trailing hyphen or underscore invalidates the whole address
    if email.endswith(("-", "_")):
        raise NotFound

    end = match.end()

    # Not really part of the GFM spec, but an attempt to cover up its
    # flaws: if a trailing hyphen or underscore invalidates an autolink,
    # an escaped hyphen or underscore should invalidate it too.
    if src.startswith(("\\-", "\\_"), end):
        raise NotFound

    return end, email
87 changes: 87 additions & 0 deletions src/mdformat_gfm/_text_inline_rule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""A replacement for the "text" inline rule in markdown-it.
The default "text" rule will skip until the next character in
`_TerminatorChars` is found. This extends the set of termination points
to those that can potentially be the beginning of a GFM autolink. The
GFM autolink plugin also works with "text" inline rule disabled, but
this should (at least partially) bring back the performance boost that
"text" inline rule provides.
"""

import re

from markdown_it.rules_inline import StateInline

from mdformat_gfm import _gfm

# The default set of terminator characters
_TerminatorChars = set("\n!#$%&*+-:<=>@[\\]^_`{}~")

# Character class matching any one default terminator character
_default_terminator = f"[{re.escape(''.join(_TerminatorChars))}]"

# Alternation over the prefixes with which a GFM autolink can start
_gfm_autolink_terminator = (
    "(?:"
    + "|".join([r"www\.", "http", "mailto:", "xmpp:", r"[a-zA-Z0-9._+-]+@"])
    + ")"
)

# Character class built from `_gfm.BEFORE_AUTOLINK_CHARS`
_before_autolink = f"[{re.escape(''.join(_gfm.BEFORE_AUTOLINK_CHARS))}]"

# Used at position zero, where there is no preceding character
_RE_TERMINATOR_FIRST_CHAR = re.compile(
    "|".join([_default_terminator, _gfm_autolink_terminator])
)

# Used everywhere else. Each alternative first consumes the character
# before the termination point: any character (also newline) before a
# default terminator, or a before-autolink character before an autolink
# prefix.
_RE_TERMINATOR_NON_FIRST_CHAR = re.compile(
    f"(?s:.){_default_terminator}"
    "|"
    f"{_before_autolink}{_gfm_autolink_terminator}"
)


def text_rule(state: StateInline, silent: bool) -> bool:
    """Consume plain text up to the next possible termination point.

    Termination points are the default terminator characters and the
    locations where a GFM autolink could begin. Return False when no
    text can be consumed at `state.pos`. Otherwise advance `state.pos`
    and, unless `silent`, append the consumed text to `state.pending`.
    """
    start = state.pos
    src = state.src

    search_from = start
    if search_from == 0:
        # Position zero has no preceding character, so it needs its own check
        if _RE_TERMINATOR_FIRST_CHAR.match(src):
            return False
        search_from = 1

    # `search_from` is at least 1 here. The regex consumes the character
    # before the termination point, hence the search starts one back.
    hit = _RE_TERMINATOR_NON_FIRST_CHAR.search(src, search_from - 1)
    end = state.posMax if hit is None else hit.start() + 1

    if end == start:
        return False

    if not silent:
        state.pending += src[start:end]

    state.pos = end
    return True
Loading

0 comments on commit 16f26ae

Please sign in to comment.