Skip to content

Commit ea27cc8

Browse files
authored
♻️ Refactor: Add linkifier rule to inline chain for full links (#279)
Fixes collision of emphasis and linkifier (so `http://example.org/foo._bar_-_baz` is now a single link, not emphasized). Emails and fuzzy links are not affected by this. Implements upstream: markdown-it/markdown-it@6b58ec4
1 parent ba96f34 commit ea27cc8

File tree

11 files changed

+234
-30
lines changed

11 files changed

+234
-30
lines changed

markdown_it/common/utils.py

+12
Original file line numberDiff line numberDiff line change
@@ -304,3 +304,15 @@ def normalizeReference(string: str) -> str:
304304
# most notably, `__proto__`)
305305
#
306306
return string.lower().upper()
307+
308+
309+
# Patterns recognising a raw HTML anchor tag at the very start of a string.
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Return True when ``string`` starts with an opening ``<a`` tag.

    The tag name must be followed by ``>`` or whitespace, so tags such as
    ``<abbr>`` are not mistaken for anchors. Matching is case-insensitive.
    """
    opening = LINK_OPEN_RE.match(string)
    return opening is not None


def isLinkClose(string: str) -> bool:
    """Return True when ``string`` starts with a closing ``</a>`` tag.

    Whitespace between the tag name and ``>`` is tolerated. Matching is
    case-insensitive.
    """
    closing = LINK_CLOSE_RE.match(string)
    return closing is not None

markdown_it/parser_inline.py

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# Parser rules
1717
_rules: list[tuple[str, RuleFunc]] = [
1818
("text", rules_inline.text),
19+
("linkify", rules_inline.linkify),
1920
("newline", rules_inline.newline),
2021
("escape", rules_inline.escape),
2122
("backticks", rules_inline.backtick),

markdown_it/presets/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def make() -> PresetType:
2121
config = commonmark.make()
2222
config["components"]["core"]["rules"].append("linkify")
2323
config["components"]["block"]["rules"].append("table")
24-
config["components"]["inline"]["rules"].append("strikethrough")
24+
config["components"]["inline"]["rules"].extend(["strikethrough", "linkify"])
2525
config["components"]["inline"]["rules2"].append("strikethrough")
2626
config["options"]["linkify"] = True
2727
config["options"]["html"] = True

markdown_it/rules_core/linkify.py

+37-29
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,32 @@
1+
from __future__ import annotations
2+
13
import re
4+
from typing import Protocol
25

3-
from ..common.utils import arrayReplaceAt
6+
from ..common.utils import arrayReplaceAt, isLinkClose, isLinkOpen
47
from ..token import Token
58
from .state_core import StateCore
69

7-
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
8-
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
9-
1010
HTTP_RE = re.compile(r"^http://")
1111
MAILTO_RE = re.compile(r"^mailto:")
1212
TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)
1313

1414

15-
def isLinkOpen(string: str) -> bool:
16-
return bool(LINK_OPEN_RE.search(string))
17-
18-
19-
def isLinkClose(string: str) -> bool:
20-
return bool(LINK_CLOSE_RE.search(string))
21-
22-
2315
def linkify(state: StateCore) -> None:
24-
blockTokens = state.tokens
25-
16+
"""Rule for identifying plain-text links."""
2617
if not state.md.options.linkify:
2718
return
2819

2920
if not state.md.linkify:
3021
raise ModuleNotFoundError("Linkify enabled but not installed.")
3122

32-
for j in range(len(blockTokens)):
33-
if blockTokens[j].type != "inline" or not state.md.linkify.pretest(
34-
blockTokens[j].content
23+
for inline_token in state.tokens:
24+
if inline_token.type != "inline" or not state.md.linkify.pretest(
25+
inline_token.content
3526
):
3627
continue
3728

38-
tokens = blockTokens[j].children
29+
tokens = inline_token.children
3930

4031
htmlLinkLevel = 0
4132

@@ -71,38 +62,47 @@ def linkify(state: StateCore) -> None:
7162
currentToken.content
7263
):
7364
text = currentToken.content
74-
links = state.md.linkify.match(text)
65+
links: list[_LinkType] = state.md.linkify.match(text) or []
7566

7667
# Now split string to nodes
7768
nodes = []
7869
level = currentToken.level
7970
lastPos = 0
8071

81-
for ln in range(len(links)):
82-
url = links[ln].url
72+
# forbid escape sequence at the start of the string,
73+
# this avoids http\://example.com/ from being linkified as
74+
# http:<a href="//example.com/">//example.com/</a>
75+
if (
76+
links
77+
and links[0].index == 0
78+
and i > 0
79+
and tokens[i - 1].type == "text_special"
80+
):
81+
links = links[1:]
82+
83+
for link in links:
84+
url = link.url
8385
fullUrl = state.md.normalizeLink(url)
8486
if not state.md.validateLink(fullUrl):
8587
continue
8688

87-
urlText = links[ln].text
89+
urlText = link.text
8890

8991
# Linkifier might send raw hostnames like "example.com", where url
9092
# starts with domain name. So we prepend http:// in those cases,
9193
# and remove it afterwards.
92-
if not links[ln].schema:
94+
if not link.schema:
9395
urlText = HTTP_RE.sub(
9496
"", state.md.normalizeLinkText("http://" + urlText)
9597
)
96-
elif links[ln].schema == "mailto:" and TEST_MAILTO_RE.search(
97-
urlText
98-
):
98+
elif link.schema == "mailto:" and TEST_MAILTO_RE.search(urlText):
9999
urlText = MAILTO_RE.sub(
100100
"", state.md.normalizeLinkText("mailto:" + urlText)
101101
)
102102
else:
103103
urlText = state.md.normalizeLinkText(urlText)
104104

105-
pos = links[ln].index
105+
pos = link.index
106106

107107
if pos > lastPos:
108108
token = Token("text", "", 0)
@@ -130,12 +130,20 @@ def linkify(state: StateCore) -> None:
130130
token.info = "auto"
131131
nodes.append(token)
132132

133-
lastPos = links[ln].last_index
133+
lastPos = link.last_index
134134

135135
if lastPos < len(text):
136136
token = Token("text", "", 0)
137137
token.content = text[lastPos:]
138138
token.level = level
139139
nodes.append(token)
140140

141-
blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes)
141+
inline_token.children = tokens = arrayReplaceAt(tokens, i, nodes)
142+
143+
144+
class _LinkType(Protocol):
    """Structural type for the match objects returned by ``state.md.linkify.match``.

    Only the attributes read by the core ``linkify`` rule are declared here.
    """

    # Link target; passed through ``normalizeLink`` / ``validateLink``.
    url: str
    # Matched text, normalised and used as the visible link text.
    text: str
    # Offset in the scanned text where the match starts.
    index: int
    # Offset in the scanned text just past the end of the match.
    last_index: int
    # Detected scheme (compared against "mailto:"); falsy for raw hostnames
    # such as "example.com" that were matched without one.
    schema: str | None

markdown_it/rules_inline/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"text",
44
"fragments_join",
55
"link_pairs",
6+
"linkify",
67
"escape",
78
"newline",
89
"backtick",
@@ -24,6 +25,7 @@
2425
from .html_inline import html_inline
2526
from .image import image
2627
from .link import link
28+
from .linkify import linkify
2729
from .newline import newline
2830
from .state_inline import StateInline
2931
from .text import text

markdown_it/rules_inline/html_inline.py

+6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Process html tags
22
from ..common.html_re import HTML_TAG_RE
3+
from ..common.utils import isLinkClose, isLinkOpen
34
from .state_inline import StateInline
45

56

@@ -33,5 +34,10 @@ def html_inline(state: StateInline, silent: bool) -> bool:
3334
token = state.push("html_inline", "", 0)
3435
token.content = state.src[pos : pos + len(match.group(0))]
3536

37+
if isLinkOpen(token.content):
38+
state.linkLevel += 1
39+
if isLinkClose(token.content):
40+
state.linkLevel -= 1
41+
3642
state.pos += len(match.group(0))
3743
return True

markdown_it/rules_inline/link.py

+2
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,9 @@ def link(state: StateInline, silent: bool) -> bool:
140140
if label and state.md.options.get("store_labels", False):
141141
token.meta["label"] = label
142142

143+
state.linkLevel += 1
143144
state.md.inline.tokenize(state)
145+
state.linkLevel -= 1
144146

145147
token = state.push("link_close", "a", -1)
146148

markdown_it/rules_inline/linkify.py

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""Process links like https://example.org/"""
2+
import re
3+
4+
from .state_inline import StateInline
5+
6+
# RFC3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
# The pattern is anchored at the END of the text only: it captures a scheme
# that forms the tail of the pending text, preceded either by the start of the
# text or by a character that cannot be part of a scheme. It must therefore be
# applied with ``search()``, not ``match()``.
SCHEME_RE = re.compile(r"(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$", re.IGNORECASE)


def linkify(state: StateInline, silent: bool) -> bool:
    """Rule for identifying plain-text links.

    The rule fires when the parser is positioned on the ``://`` that follows
    a URL scheme. The scheme itself has already been consumed into
    ``state.pending`` by earlier rules, so it is recovered from the tail of the
    pending text and the full link is matched starting at the scheme.

    :param state: inline parser state (reads ``src``, ``pos``, ``posMax``,
        ``pending`` and ``linkLevel``; advances ``pos`` on success).
    :param silent: if true, only validate and advance; ``pending`` is left
        untouched and no tokens are pushed.
    :return: True if a link was recognised and consumed, False otherwise.
    :raises ModuleNotFoundError: if the ``linkify`` option is enabled but no
        linkifier instance is available on the parser.
    """
    if not state.md.options.linkify:
        return False
    # Never create links inside an existing markdown link or raw <a> element.
    if state.linkLevel > 0:
        return False
    if not state.md.linkify:
        raise ModuleNotFoundError("Linkify enabled but not installed.")

    pos = state.pos
    maximum = state.posMax

    # Cheap pre-check: the cursor must sit exactly on "://".
    if (
        (pos + 3) > maximum
        or state.src[pos] != ":"
        or state.src[pos + 1] != "/"
        or state.src[pos + 2] != "/"
    ):
        return False

    # ``search`` (not ``match``): the scheme is at the end of the pending text
    # and is usually preceded by other text, e.g. "see http".
    if not (match := SCHEME_RE.search(state.pending)):
        return False

    proto = match.group(1)
    if not (link := state.md.linkify.match_at_start(state.src[pos - len(proto) :])):
        return False
    url: str = link.url

    # disallow '*' at the end of the link (conflicts with emphasis)
    url = url.rstrip("*")

    # A link no longer than its scheme would leave the cursor where it is
    # (or move it backwards); refuse it so the parser always makes progress.
    if len(url) <= len(proto):
        return False

    full_url = state.md.normalizeLink(url)
    if not state.md.validateLink(full_url):
        return False

    if not silent:
        # The scheme was already accumulated as plain text; take it back so
        # that it becomes part of the link instead.
        state.pending = state.pending[: -len(proto)]

        token = state.push("link_open", "a", 1)
        token.attrs = {"href": full_url}
        token.markup = "linkify"
        token.info = "auto"

        token = state.push("text", "", 0)
        token.content = state.md.normalizeLinkText(url)

        token = state.push("link_close", "a", -1)
        token.markup = "linkify"
        token.info = "auto"

    # ``url`` starts len(proto) characters before ``pos``, so skip only the
    # part of it that lies at or after the cursor.
    state.pos += len(url) - len(proto)
    return True

markdown_it/rules_inline/state_inline.py

+4
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ def __init__(
7070
self.backticks: dict[int, int] = {}
7171
self.backticksScanned = False
7272

73+
# Counter used to disable inline linkify-it execution
74+
# inside <a> and markdown links
75+
self.linkLevel = 0
76+
7377
def __repr__(self) -> str:
7478
return (
7579
f"{self.__class__.__name__}"

tests/test_api/test_main.py

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def test_get_rules():
3030
],
3131
"inline": [
3232
"text",
33+
"linkify",
3334
"newline",
3435
"escape",
3536
"backticks",

0 commit comments

Comments
 (0)