generate-index.py
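"""Build search-index.json for the GitHub Pages site.

Reads every HTML file in ./notebooks, extracts each page's <h1> title and
visible text, and writes the results to search-index.json as a list of
{"title", "url", "content"} records.
"""
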
import os
import json

from bs4 import BeautifulSoup

# Directory containing HTML files
HTML_DIR = "./notebooks"

# Base URL for GitHub Pages
BASE_URL = "https://arghyadutta.github.io/"


def extract_text_from_tag(tag):
    if tag.name in ["script", "style"]:
        return ""
    return " ".join(tag.stripped_strings)


def extract_content(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Get <h1> as title (fall back to the filename if missing)
    h1_tag = soup.find("h1")
    title = h1_tag.get_text(strip=True) if h1_tag else os.path.basename(file_path)

    # Extract text from <p>, <h2>-<h6>, <div>, <dt>, <dd>, and <li> tags
    # (script/style tags are skipped in extract_text_from_tag)
    tags = soup.find_all(["p", "h2", "h3", "h4", "h5", "h6", "div", "dt", "dd", "li"])

    # Use a set to deduplicate content (order is not preserved, which is
    # acceptable for a search index)
    unique_content = set(extract_text_from_tag(tag) for tag in tags)

    # Join the unique content into a single string
    content = " ".join(unique_content)

    # Debug: print the content to check for duplicates
    print(f"Extracted content from {file_path}:\n{content}\n")

    # Get the path of the file relative to the repository root
    relative_path = os.path.relpath(file_path, start=os.path.dirname(HTML_DIR))

    # Construct the GitHub Pages URL (ensure forward slashes)
    url = BASE_URL + relative_path.replace("\\", "/")

    return {"title": title, "url": url, "content": content[:100000]}


# Generate the search index
index_data = []
unique_files = set(os.listdir(HTML_DIR))
for filename in unique_files:
    if filename.lower().endswith(".html") and not filename.startswith("."):
        file_path = os.path.abspath(os.path.join(HTML_DIR, filename))
        index_data.append(extract_content(file_path))

# Save to search-index.json
with open("search-index.json", "w", encoding="utf-8") as json_file:
    json.dump(index_data, json_file, indent=2, ensure_ascii=False)

print("Search index updated with correct GitHub Pages URLs!")