Skip to content

Include HTML headers in TOC #538

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Dec 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- [pull #532] Fix #493 persisting when `code-friendly` extra enabled
- [pull #535] Update `_slugify` to use utf-8 encoding (issue #534)
- [pull #536] Maintain order of appearance in footnotes
- [pull #538] Include HTML headers in TOC

## python-markdown2 2.4.10

Expand Down
61 changes: 60 additions & 1 deletion lib/markdown2.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,13 @@ def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
else:
self._toc_depth = self.extras["toc"].get("depth", 6)

if 'header-ids' in self.extras:
if not isinstance(self.extras['header-ids'], dict):
self.extras['header-ids'] = {
'mixed': False,
'prefix': self.extras['header-ids']
}

if 'break-on-newline' in self.extras:
self.extras.setdefault('breaks', {})
self.extras['breaks']['on_newline'] = True
Expand Down Expand Up @@ -424,6 +431,17 @@ def convert(self, text):
text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text)

if "toc" in self.extras and self._toc:
if self.extras['header-ids'].get('mixed'):
# TOC will only be out of order if mixed headers is enabled
def toc_sort(entry):
    '''Sort key for a TOC entry: offset of its rendered header in `text`.

    The pattern below uses `entry[0]` as the header level, `entry[1]` as
    the value of the header's `id` attribute and `entry[2]` as the
    header's inner text. Both the id and the text are escaped so that
    regex metacharacters in either are matched literally.

    Returns the start offset of the first matching header line, or 0 when
    no header line matches, so sorting never raises on a missing match.
    '''
    match = re.search(
        # header tag, any attrs, the ID, any attrs, the text, close tag
        r'^<(h%d).*?id=(["\'])%s\2.*>%s</\1>$' % (
            entry[0], re.escape(entry[1]), re.escape(entry[2])),
        text, re.M
    )
    # re.search returns None when nothing matches; fall back to a fixed key
    return match.start() if match else 0

self._toc.sort(key=toc_sort)
self._toc_html = calculate_toc_html(self._toc)

# Prepend toc html to output
Expand Down Expand Up @@ -783,6 +801,8 @@ def _hash_html_block_sub(self, match, raw=False):
return ''.join(["\n\n", f_key,
"\n\n", middle, "\n\n",
l_key, "\n\n"])
elif self.extras.get('header-ids', {}).get('mixed') and self._h_tag_re.match(html):
html = self._h_tag_re.sub(self._h_tag_sub, html)
key = _hash_text(html)
self.html_blocks[key] = html
return "\n\n" + key + "\n\n"
Expand Down Expand Up @@ -1786,6 +1806,13 @@ def header_id_from_text(self, text, prefix, n):

return header_id

def _header_id_exists(self, text):
    '''Check whether the header ID built from `text` is already recorded.

    The ID is the slug of `text`, prepended with the configured
    `header-ids` prefix and a dash when that prefix is a non-empty string.
    Returns True if that ID is a member of `self._count_from_header_id`.
    '''
    candidate = _slugify(text)
    id_prefix = self.extras['header-ids'].get('prefix')
    if isinstance(id_prefix, str) and id_prefix:
        candidate = '%s-%s' % (id_prefix, candidate)
    return candidate in self._count_from_header_id

def _toc_add_entry(self, level, id, name):
if level > self._toc_depth:
return
Expand All @@ -1810,6 +1837,7 @@ def _toc_add_entry(self, level, id, name):
_h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)

def _h_sub(self, match):
'''Handles processing markdown headers'''
if match.group(1) is not None and match.group(3) == "-":
return match.group(1)
elif match.group(1) is not None:
Expand All @@ -1827,14 +1855,45 @@ def _h_sub(self, match):
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(header_group,
self.extras["header-ids"], n)
self.extras["header-ids"].get('prefix'), n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(header_group)
if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)

# Matches one raw HTML header element (<h1>..<h6>) starting at the beginning
# of a line. The attrs group stops at the first '>' so it cannot swallow
# inline tags inside the header text, and the text group is non-greedy so
# it ends at the first matching close tag.
_h_tag_re = re.compile(r'''
    ^<h([1-6])([^>\n]*)>  # \1 tag num, \2 attrs
    (.*?)                 # \3 text
    </h\1>
    ''', re.X | re.M)

def _h_tag_sub(self, match):
    '''Substitution callback for headers that were written as raw HTML.

    Different to `_h_sub` in that this function handles existing HTML
    headers rather than markdown ones.

    `match` is a match of `_h_tag_re`: group 1 is the header level,
    group 2 the tag's attribute text and group 3 the header's inner text.

    Returns the header markup, unchanged when it already carries an id,
    or with a generated `id` attribute inserted after the tag name.
    When the `toc` extra is enabled the header is also passed to
    `_toc_add_entry`.
    '''
    text = match.group(0)
    h_level = int(match.group(1))
    h_text = match.group(3)

    # Extract the value of an existing id= attribute. The attribute name must
    # start the attrs or follow whitespace, so that names merely ending in
    # "id" (eg: `data-id=`) are not mistaken for the header's id.
    id_match = re.search(r'(?:^|\s)id=(\S+)', match.group(2) or '')
    id_attr = id_match.group(1).strip('\'" ') if id_match else ''

    # check if header was already processed (ie: was a markdown header rather than HTML)
    if id_attr and self._header_id_exists(id_attr):
        return text

    # generate new header id if none existed
    header_id = id_attr or self.header_id_from_text(h_text, self.extras['header-ids'].get('prefix'), h_level)
    if "toc" in self.extras:
        self._toc_add_entry(h_level, header_id, h_text)
    if header_id and not id_attr:
        # '<h[digit]' + new ID + '...'
        return text[:3] + ' id="%s"' % header_id + text[3:]
    return text

def _do_headers(self, text):
# Setext-style headers:
# Header 1
Expand Down
11 changes: 11 additions & 0 deletions test/tm-cases/mixed_header_ids.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<h1 id="header-1">Header 1</h1>

<h2 id="header-2">Header 2</h2>

<h1 id="header-3">Header 3</h1>

<h4 id="header-4" class="myclass">Header 4</h4>

<h1 id="header-5">Header 5</h1>

<h6 id="my-important-id">Header 6</h6>
1 change: 1 addition & 0 deletions test/tm-cases/mixed_header_ids.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"extras": {"header-ids": {"mixed": True}, "toc": None}}
11 changes: 11 additions & 0 deletions test/tm-cases/mixed_header_ids.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Header 1

<h2>Header 2</h2>

# Header 3

<h4 class="myclass">Header 4</h4>

# Header 5

<h6 id="my-important-id">Header 6</h6>
14 changes: 14 additions & 0 deletions test/tm-cases/mixed_header_ids.toc_html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<ul>
<li><a href="#header-1">Header 1</a>
<ul>
<li><a href="#header-2">Header 2</a></li>
</ul></li>
<li><a href="#header-3">Header 3</a>
<ul>
<li><a href="#header-4">Header 4</a></li>
</ul></li>
<li><a href="#header-5">Header 5</a>
<ul>
<li><a href="#my-important-id">Header 6</a></li>
</ul></li>
</ul>