Skip to content
This repository has been archived by the owner on Jan 29, 2024. It is now read-only.

Commit

Permalink
Merge pull request #1927 from aiven/preprocess-sitemap-exclude-html-e…
Browse files Browse the repository at this point in the history
…xtension

Preprocess sitemap exclude html extension
  • Loading branch information
angelinekwan authored Jul 6, 2023
2 parents c932476 + 345a03e commit 6547578
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 0 deletions.
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ help:
# Catch-all rule: any target that has no explicit rule of its own is handed
# to sphinx-build in "make mode" (-M), with the requested target name ($@)
# passed straight through.
%: Makefile
	@$(SPHINXBUILD) -M $@ \
		"$(SOURCEDIR)" "$(BUILDDIR)" \
		$(SPHINXOPTS) $(O)

# Build the HTML docs, then rewrite the sitemap generated by sphinx_sitemap
# so that its <loc> URLs no longer end in "index.html" or ".html".
# Cloudflare Pages redirects every .html URL to its parent, so leaving the
# extension in the sitemap causes a redirect loop for search engines.
#
# This explicit rule takes precedence over the catch-all "%" rule for the
# html target only. The post-processing script looks for
# ./_build/html/sitemap.xml relative to the directory make is run from.
#
# html is a command name, not a file: declare it phony so a stray file or
# directory called "html" can never make this target look up to date.
.PHONY: html
html:
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
	python "$(SOURCEDIR)/scripts/postprocess_sitemap.py"

# Run sphinx-autobuild against the docs source, writing into $(BUILDDIR).
# livehtml is a command name, not a file: declare it phony so a stray file
# called "livehtml" can never make this target look up to date.
.PHONY: livehtml
livehtml:
	sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ sphinx-external-toc==0.2.3
sphinx-copybutton==0.5.0
sphinx_gitstamp==0.3.2
beautifulsoup4==4.9.3
lxml==4.9.2
opensearch-py==1.0.0
requests==2.31.0
sphinxext-opengraph==0.4.2
Expand Down
25 changes: 25 additions & 0 deletions scripts/postprocess_sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Post-process the sitemap.xml generated by sphinx_sitemap.

Every ``<loc>`` is rewritten so that it no longer ends in ``index.html`` or
``.html``, and the entry for the 404 page is removed. The file is edited in
place.

Usage::

    python postprocess_sitemap.py [path/to/sitemap.xml]

When no path is given, ``./_build/html/sitemap.xml`` (relative to the current
working directory) is used.
"""
import sys

from bs4 import BeautifulSoup

DEFAULT_SITEMAP_PATH = './_build/html/sitemap.xml'

HTML_SUFFIX = '.html'
INDEX_PAGE = 'index.html'
# Final path segments that identify the 404 page, with or without extension.
NOT_FOUND_PAGES = ('404.html', '404')


def _clean_url(url):
    """Return ``url`` without its HTML suffix, or ``None`` to drop the entry.

    * ``.../404.html``       -> ``None`` (entry is removed from the sitemap)
    * ``.../dir/index.html`` -> ``.../dir/``
    * ``.../page.html``      -> ``.../page`` (includes ``genindex.html``)
    * anything else          -> unchanged

    Only the final path segment is inspected, so a page such as
    ``reindex.html`` is not mistaken for an index page and a URL that merely
    contains "404" somewhere is not dropped.
    """
    page = url.rpartition('/')[2]
    if page in NOT_FOUND_PAGES:
        return None
    if page == INDEX_PAGE:
        # Keep the trailing slash of the parent directory.
        return url[:-len(INDEX_PAGE)]
    if page.endswith(HTML_SUFFIX):
        return url[:-len(HTML_SUFFIX)]
    return url


def main(argv=None):
    """Rewrite the sitemap in place.

    ``argv`` is the list of command-line arguments (without the program
    name); it defaults to ``sys.argv[1:]``. The first argument, if present,
    is the path of the sitemap to process.
    """
    args = sys.argv[1:] if argv is None else argv
    sitemap_path = args[0] if args else DEFAULT_SITEMAP_PATH

    # The sitemap is written back with a UTF-8 XML declaration, so read and
    # write it as UTF-8 explicitly instead of relying on the platform default.
    with open(sitemap_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'xml')

    for entry in soup.find_all('url'):
        loc = entry.find('loc')
        # Leave malformed entries (no <loc>, or no plain-text content) alone
        # rather than crashing on them.
        if loc is None or loc.string is None:
            continue
        cleaned = _clean_url(loc.string.strip())
        if cleaned is None:
            entry.decompose()
        else:
            loc.string = cleaned

    with open(sitemap_path, 'w', encoding='utf-8') as f:
        f.write(str(soup))


if __name__ == '__main__':
    main()

0 comments on commit 6547578

Please sign in to comment.