1+ #!/usr/bin/env python3
2+ """
3+ Script to process markdown files before syncing to deploy branch.
4+
5+ This script:
6+ 1. Removes YAML frontmatter description sections from MD files
7+ 2. Converts level 2 titles to level 1 titles in SUMMARY.md
8+ 3. Removes <figure></figure> HTML elements but keeps content inside
9+ 4. Converts all GitBook {% %} tags to standard markdown format
10+ """
11+
12+ import os
13+ import re
14+ import sys
15+ from pathlib import Path
16+
17+
def remove_yaml_frontmatter(content):
    """Remove a leading YAML frontmatter block from markdown content.

    Frontmatter is only recognised when the very first line of the document
    is ``---`` and a later line consisting of ``---`` closes the block.
    Horizontal rules (``---``) that appear further down in the document are
    left untouched.

    Args:
        content: Full text of a markdown document.

    Returns:
        The text without its leading frontmatter block, or the text unchanged
        when it does not start with one.
    """
    # \A pins the match to the start of the document. The previous pattern
    # used ^ together with re.MULTILINE, so every later pair of "---" rules
    # was treated as frontmatter and the text between them was deleted.
    # The optional middle group also allows an empty block ("---\n---\n"),
    # and \r?\n tolerates Windows line endings.
    pattern = r'\A---[ \t]*\r?\n(?:.*?\r?\n)?---[ \t]*(?:\r?\n|\Z)'
    return re.sub(pattern, '', content, count=1, flags=re.DOTALL)
23+
24+
def process_summary_titles(content):
    """Promote every level 2 heading (``## ``) to level 1 (``# ``).

    Only lines that begin with exactly two hashes followed by a space are
    rewritten; deeper headings and hashes in the middle of a line are kept.
    """
    level_two_heading = re.compile(r'^## ', re.MULTILINE)
    return level_two_heading.sub('# ', content)
29+
30+
def remove_figure_tags(content):
    """Strip ``<figure>`` wrappers while keeping whatever they contain.

    Opening tags (with any attributes) and closing tags are deleted; the
    markup between them, such as ``<img>`` or ``<figcaption>``, is preserved.
    Runs of three or more line breaks (possibly separated by whitespace-only
    lines) are then collapsed to a single blank line.

    Args:
        content: Markdown/HTML text to clean.

    Returns:
        The text with the figure tags removed and blank lines normalised.
    """
    # \b stops the pattern from also eating longer tag names that merely
    # start with "figure" (e.g. <figures>).
    content = re.sub(r'<figure\b[^>]*>', '', content)
    content = re.sub(r'</figure\s*>', '', content)

    # Collapse to exactly one blank line. The replacement must not contain
    # spaces: "\n \n " leaves trailing whitespace and indents the next line.
    content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)

    return content
43+
44+
def convert_gitbook_tags(content):
    """Convert all GitBook {% %} tags to standard markdown format.

    Handling, in order:
      * ``{% embed url="..." %}`` becomes a markdown link. YouTube URLs with
        a recognisable video id become a clickable thumbnail followed by a
        "Watch on YouTube" link.
      * ``{% hint ... %}`` becomes the blockquote prefix ``> **Note:** ``.
      * ``{% code ... %}`` / ``{% endcode %}`` are removed; the code between
        them stays.
      * Any other tag carrying ``url="..."`` becomes ``[Link](url)``.
      * Every remaining ``{% ... %}`` tag is removed.

    Args:
        content: Markdown text that may contain GitBook tags.

    Returns:
        The text with all GitBook tags converted or removed.
    """
    embed_pattern = r'{%\s*embed\s+url="([^"]+)"\s*%}'
    # Matches any other {% ... %} tag (hints, code blocks, content refs, ...).
    general_tag_pattern = r'{%\s*([^%]+)\s*%}'

    def convert_embed(match):
        """Build the markdown replacement for one embed tag."""
        url = match.group(1)

        if 'youtube.com' not in url and 'youtu.be' not in url:
            return f"[View Content]({url})"

        # Extract the video id for the URL shapes we recognise.
        video_id = None
        if 'youtube.com/watch?v=' in url:
            video_id = url.split('watch?v=', 1)[1].split('&')[0]
        elif 'youtube.com/embed/' in url:
            video_id = url.split('/embed/', 1)[1].split('?')[0]
        elif 'youtu.be/' in url:
            video_id = url.split('youtu.be/', 1)[1].split('?')[0]

        if not video_id:
            return f"[YouTube Video]({url})"

        # The id was previously extracted but never used, which produced an
        # empty link "[](url)". Use it for the thumbnail image instead.
        # NOTE(review): thumbnail URL scheme assumed from the original
        # "thumbnail and link" comment - confirm the preferred image size.
        thumbnail = f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg"
        return (
            f"[![YouTube Video]({thumbnail})]({url})\n\n"
            f"[Watch on YouTube]({url})"
        )

    def convert_general_tag(match):
        """Build the markdown replacement for any non-embed tag."""
        tag_content = match.group(1).strip()

        if tag_content.startswith('hint'):
            return "> **Note:** "
        if tag_content.startswith(('code', 'endcode')):
            # Drop the wrapper tags; the code between them is kept.
            return ""
        if 'url=' in tag_content and 'embed' not in tag_content:
            url_match = re.search(r'url="([^"]+)"', tag_content)
            if url_match:
                return f"[Link]({url_match.group(1)})"

        # Unrecognised tags (including closing tags) are simply removed.
        return ""

    # Embed tags first, so the generic pass never sees them.
    content = re.sub(embed_pattern, convert_embed, content)
    content = re.sub(general_tag_pattern, convert_general_tag, content)

    return content
103+
104+
def process_markdown_file(file_path):
    """Process a single markdown file in place.

    Reads the file as UTF-8, removes YAML frontmatter, strips ``<figure>``
    tags and converts GitBook tags. Files named ``SUMMARY.md`` additionally
    get their level 2 headings promoted to level 1. The file is rewritten
    only when the result differs from what was read.

    Args:
        file_path: Path to the markdown file, as a ``str`` or ``Path``.

    Returns:
        True when the file was rewritten; False when nothing changed or when
        processing failed (the failure is reported on stderr).
    """
    try:
        # Accept plain strings too: the SUMMARY.md check below needs .name,
        # which previously raised (and was swallowed) for str input.
        file_path = Path(file_path)

        with open(file_path, 'r', encoding='utf-8') as f:
            original_content = f.read()

        content = remove_yaml_frontmatter(original_content)
        content = remove_figure_tags(content)
        content = convert_gitbook_tags(content)

        if file_path.name == 'SUMMARY.md':
            content = process_summary_titles(content)

        if content == original_content:
            print(f"No changes: {file_path} ")
            return False

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Processed: {file_path} ")
        return True

    except Exception as e:
        # Best effort per file: report and let the caller continue with the
        # next one. Errors go to stderr so they stand out from progress output.
        print(f"Error processing {file_path} : {e} ", file=sys.stderr)
        return False
135+
136+
def find_markdown_files(src_dir):
    """Find all markdown files below a directory.

    Args:
        src_dir: Directory to search recursively, as a ``str`` or ``Path``.

    Returns:
        A sorted list of ``Path`` objects for every regular file whose name
        ends in ``.md``. An empty list (after printing a message) when
        ``src_dir`` does not exist.
    """
    src_path = Path(src_dir)
    if not src_path.exists():
        print(f"Source directory {src_dir} does not exist")
        return []

    # rglob also yields directories whose name ends in ".md"; opening those
    # later would fail, so keep regular files only. Sorting makes the
    # processing order independent of filesystem enumeration order.
    return sorted(path for path in src_path.rglob('*.md') if path.is_file())
146+
147+
def main():
    """Process every markdown file of the repository and report progress.

    The repository root is taken to be two directories above the directory
    containing this script. When a ``src`` directory exists under the root
    (deploy branch layout) it is searched; otherwise the root itself is
    searched (main branch layout).

    Returns:
        Always 0, to be used as the process exit code.
    """
    repo_root = Path(__file__).parent.parent.parent
    print(f"Repository root: {repo_root} ")

    # Pick the directory to scan depending on the branch layout.
    src_dir = repo_root / 'src'
    if src_dir.exists():
        print(f"Found src directory: {src_dir} ")
        search_root = src_dir
    else:
        print(f"Using root directory: {repo_root} ")
        search_root = repo_root
    markdown_files = find_markdown_files(search_root)

    if not markdown_files:
        print("No markdown files found to process")
        return 0

    print(f"Found {len(markdown_files)} markdown files to process")

    # Count only the files that were actually rewritten.
    processed_count = sum(
        1 for md_file in markdown_files if process_markdown_file(md_file)
    )

    print(f"Successfully processed {processed_count} files")
    return 0
182+
183+
# Run the processing only when executed as a script, and hand main()'s
# return value to the shell as the exit status.
if __name__ == '__main__':
    exit_code = main()
    sys.exit(exit_code)
0 commit comments