Skip to content

Commit 560e6d2

Browse files
authored
docs: script to convert HTML manual pages to markdown (#4620)
1 parent 344096a commit 560e6d2

File tree

2 files changed

+77
-0
lines changed

2 files changed

+77
-0
lines changed

utils/grass_html2md.sh

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash
2+
set -eu
3+
4+
###############################################################################
5+
# Convert recursively all .html files to .md (GitHub flavoured Markdown)
6+
#
7+
# Dependencies:
8+
# pandoc
9+
# wget
10+
#
11+
# Author(s):
12+
# Martin Landa, Markus Neteler
13+
#
14+
# Usage:
15+
# With "pandoc" in PATH, run the script from the top-level source directory
16+
# to convert the HTML files in that directory and all its subdirectories:
17+
# ./utils/grass_html2md.sh
18+
#
19+
# COPYRIGHT: (C) 2024 by the GRASS Development Team
20+
#
21+
# This program is free software under the GNU General Public
22+
# License (>=v2). Read the file COPYING that comes with GRASS
23+
# for details.
24+
#
25+
###############################################################################
26+
27+
# Cleanup at user break: remove the temporary HTML file of the page that is
# currently being converted.
# Globals:
#   f (read) - path of the current .html page, set by the conversion loop.
# A break can arrive before any page has been picked, i.e. while "f" is still
# unset; the ${f:-} guard keeps "set -u" from aborting the handler in that case.
cleanup()
{
    if [ -n "${f:-}" ]; then
        rm -f -- "${f%.html}_tmp.html"
    fi
}
32+
33+
# What to do in case of user break (installed as signal handler by the trap
# below): report the interruption, remove the temporary file via cleanup()
# and terminate with exit status 1.
# The message goes to stderr so that it does not mix with the list of
# converted file names the script prints on stdout.
exitprocedure()
{
    echo "User break!" >&2
    cleanup
    exit 1
}
40+
# Route user-break signals to exitprocedure: SIGINT (2), SIGQUIT (3) and
# SIGTERM (15). "trap -l" prints the full signal list.
trap 'exitprocedure' INT QUIT TERM
42+
43+
# path to LUA file (./utils/pandoc_codeblock.lua)
UTILSPATH="utils"

# Let a failing pandoc (or sed) stage fail its whole pipeline, so that a broken
# conversion is not mistaken for a successful one.
set -o pipefail

#######################################
# Convert one HTML page to GitHub flavoured Markdown. The result is written
# next to the input, with the extension .md instead of .html.
# Globals:
#   f          (written) path of the page being converted; cleanup() reads it
#   UTILSPATH  (read)    directory containing pandoc_codeblock.lua
# Arguments:
#   $1 - path to the .html file
# Outputs:
#   the path of the page on stdout
# Returns:
#   0 on success, non-zero when preprocessing or the pandoc pipeline failed
#######################################
convert_html_page()
{
    # deliberately global: the user-break handler needs it to find the tmp file
    f="$1"
    local stem="${f%.html}"
    local tmp_html="${stem}_tmp.html"
    local status=0

    printf '%s\n' "${f}"

    # HTML preprocessing into the tmp file
    sed -E '
        # Step 1: Preserve http/https links with .html (and optional anchors)
        s|(<a href="https?://[^"]+\.html)(#[^"]*)?">|\1_KEEPHTML\2">|g;
        # Step 2: Replace .html with .md for local links (with or without anchors)
        s|(<a href=")([^"]+)\.html(#[^"]*)?">|\1\2.md\3">|g;
        # Step 3: Restore preserved http/https links with .html
        s|_KEEPHTML||g;
        # Step 4: Rewrite <div class="code"><pre> boxes as <pre><code>
        # (presumably so that pandoc treats them as code blocks)
        s#<div class="code"><pre>#<pre><code>#g;
        s#</pre></div>#</code></pre>#g;
    ' "${f}" > "${tmp_html}" || {
        rm -f "${tmp_html}"
        return 1
    }

    # HTML -> GFM; afterwards turn " \$" back into " $" and "%20" into "-"
    pandoc --from=html --to=gfm \
        --lua-filter "${UTILSPATH}/pandoc_codeblock.lua" < "${tmp_html}" \
        | sed -e 's+ \\\$+ \$+g' -e 's+%20+-+g' > "${stem}.md" \
        || status=$?

    rm -f "${tmp_html}"
    return "${status}"
}

# run recursively: HTML to MD
# The pattern is quoted so that the shell cannot expand it before find sees it,
# and the paths are read NUL-delimited so that names containing whitespace stay
# intact. The list is collected completely before the first conversion, so the
# *_tmp.html files created meanwhile are never picked up by find.
html_pages=()
while IFS= read -r -d '' html_page; do
    html_pages+=("${html_page}")
done < <(find . -name '*.html' -print0)

# ${arr[@]+...} keeps "set -u" quiet on an empty array in older bash versions
for html_page in ${html_pages[@]+"${html_pages[@]}"}; do
    convert_html_page "${html_page}"
done

utils/pandoc_codeblock.lua

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
-- Pandoc Lua filter to handle code blocks
2+
-- Test cases
3+
-- raster/r.sun/r.sun.html
4+
5+
-- Re-emit every code block as raw Markdown: the block's text wrapped in a
-- fence tagged "shell".
function CodeBlock (cb)
  local fenced = table.concat({'```shell', cb.text, '```', ''}, '\n')
  return pandoc.RawBlock('markdown', fenced)
end

0 commit comments

Comments
 (0)