Merge pull request #1927 from aiven/preprocess-sitemap-exclude-html-e…

…xtension Preprocess sitemap exclude html extension
aiven · Jul 6, 2023 · 6547578 · 6547578
2 parents c932476 + 345a03e
commit 6547578
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 0 deletions.
diff --git a/Makefile b/Makefile
@@ -21,6 +21,13 @@ help:
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
+# Build the HTML docs, then rewrite the sitemap generated by sphinx_sitemap so
+# that <loc> entries no longer end in "index.html" or ".html". Cloudflare Pages
+# redirects every .html URL to its extensionless parent, so listing the .html
+# form in the sitemap sends search engines through a redirect.
+# `html` is a command name, not a file this rule creates, so mark it phony to
+# keep a stray file or directory called "html" from suppressing the build.
+.PHONY: html
+html:
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+	python "$(SOURCEDIR)/scripts/postprocess_sitemap.py"
+
 livehtml:
 	sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 

diff --git a/requirements.txt b/requirements.txt
@@ -7,6 +7,7 @@ sphinx-external-toc==0.2.3
 sphinx-copybutton==0.5.0
 sphinx_gitstamp==0.3.2
 beautifulsoup4==4.9.3
+lxml==4.9.2
 opensearch-py==1.0.0
 requests==2.31.0
 sphinxext-opengraph==0.4.2

diff --git a/scripts/postprocess_sitemap.py b/scripts/postprocess_sitemap.py
@@ -0,0 +1,25 @@
+from bs4 import BeautifulSoup
+
+with open('./_build/html/sitemap.xml', 'r') as f:
+    contents = f.read()
+
+soup = BeautifulSoup(contents, 'xml')
+
+urls = soup.find_all('url')
+
+for url in urls:
+    loc = url.find('loc')
+    text = loc.string
+    # Remove the 'gen' and '404' pages
+    if '404' in text:
+        url.decompose()
+        continue
+    if text.endswith('genindex.html'):
+        loc.string = text[:-5]  # removes the ".html"
+    elif text.endswith('index.html'):
+        loc.string = text[:-10]  # removes the "index.html"
+    elif text.endswith('.html'):
+        loc.string = text[:-5]  # removes the ".html"
+
+with open('./_build/html/sitemap.xml', 'w') as f:
+    f.write(str(soup))