5 changes: 4 additions & 1 deletion .env.example
@@ -61,6 +61,9 @@ RESUME_PARTIAL=true
# Enable detailed logging for troubleshooting
DEBUG=false

# Download subtitles/captions when available (default: true)
SUBTITLE_DOWNLOAD_ENABLED=true

# ===============================================
# ADVANCED SETTINGS
# ===============================================
@@ -83,4 +86,4 @@ COURSE_DATA_FILE=""
# ALL_VIDEO_FORMATS=false

# Log level (DEBUG, INFO, WARNING, ERROR)
# LOG_LEVEL="INFO"
# LOG_LEVEL="INFO"
3 changes: 3 additions & 0 deletions .gitignore
@@ -85,6 +85,9 @@ downloads/
*.mkv
ffmpeg.log

# Allow HTML templates used by the offline site generator
!thinkific_downloader/templates/*.html

# But allow certain JSON files
!package.json
!requirements.json
3 changes: 3 additions & 0 deletions README.md
@@ -52,6 +52,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor
| 📄 **HTML Content** | ✅ Full | `downloader.py` | Clean extraction, formatting |
| 📚 **PDF Documents** | ✅ Full | `downloader.py` | Direct download, validation |
| 🎵 **Audio Files** | ✅ Full | `downloader.py` | MP3, M4A support |
| 📝 **Subtitles (Wistia)** | ✅ Full | `wistia_downloader.py` | Multi-language caption downloads |
| 🎯 **Quizzes** | ✅ Basic | `downloader.py` | Structure extraction |
| 🎨 **Presentations** | ✅ Full | FFmpeg merge | Multi-slide processing |

@@ -70,6 +71,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor
- **Resume Support** - Skip existing files, continue interrupted downloads
- **Atomic Resume/Backup** - Status file is always safely backed up and updated, works on Windows, Mac, Linux
- **Multiple Quality Options** - Choose video quality (720p, 1080p, etc.)
- **Subtitle Downloads** - Automatically grab Wistia caption tracks in multiple languages
- **Comprehensive Logging** - Debug mode for troubleshooting

### 🛡️ **Safety & Compliance**
@@ -201,6 +203,7 @@ RATE_LIMIT_MB_S= # Rate limit in MB/s (empty = unlimited)
VALIDATE_DOWNLOADS=true # Enable file integrity validation
RESUME_PARTIAL=true # Enable resume for partial downloads
DEBUG=false # Enable debug logging
SUBTITLE_DOWNLOAD_ENABLED=true # Download subtitles/captions when available

# ===============================================
# ADVANCED SETTINGS
119 changes: 116 additions & 3 deletions thinkific_downloader/__main__.py
@@ -1,10 +1,123 @@
#!/usr/bin/env python3
"""
Command line entry point for Thinkific Downloader
Command line entry point for Thinkific Downloader and offline site generator.

Usage examples:
python -m thinkific_downloader <course_url>
python -m thinkific_downloader --json beginner-course.json
python -m thinkific_downloader generate-site beginner-course.json --clean
"""

from __future__ import annotations

import argparse
import sys
from thinkific_downloader.downloader import main
from pathlib import Path
from typing import List, Optional

from thinkific_downloader.downloader import main as downloader_main
from thinkific_downloader.site_generator import (
SiteGenerationError,
generate_site,
load_course,
)

# Note: keep console output lightweight so it mirrors existing downloader UX.


def _run_generate_site(argv: List[str]) -> int:
parser = argparse.ArgumentParser(
prog="thinkific_downloader generate-site",
description="Validate downloaded Thinkific course assets and build an offline viewer.",
)
parser.add_argument(
"metadata",
help="Path to the course metadata JSON file (e.g., beginner-chess-mastery.json).",
)
parser.add_argument(
"--downloads-dir",
dest="downloads_dir",
help="Override the downloads root directory (defaults to <metadata>/../downloads).",
)
parser.add_argument(
"--output-dir",
dest="output_dir",
help="Directory to write the generated site (defaults to downloads/<course-slug>/).",
)
parser.add_argument(
"--assets-dirname",
dest="assets_dirname",
default="site-assets",
help="Subdirectory name for bundled CSS/JS assets (default: site-assets).",
)
parser.add_argument(
"--clean",
action="store_true",
help="Remove previously generated site files before rendering.",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Validate metadata and assets without writing any files.",
)
parser.add_argument(
"-q",
"--quiet",
action="store_true",
help="Suppress success output; errors will still be printed.",
)

args = parser.parse_args(argv)

metadata_path = Path(args.metadata).expanduser()
downloads_dir: Optional[Path] = None
output_dir: Optional[Path] = None

if args.downloads_dir:
downloads_dir = Path(args.downloads_dir).expanduser()
if args.output_dir:
output_dir = Path(args.output_dir).expanduser()

try:
if args.dry_run:
load_course(metadata_path, downloads_root=downloads_dir)
if not args.quiet:
print("✅ Course assets validated (dry run).")
return 0

generated_index = generate_site(
metadata_path,
downloads_root=downloads_dir,
output_dir=output_dir,
clean=args.clean,
assets_dirname=args.assets_dirname,
)
if not args.quiet:
print(f"✅ Offline course generated: {generated_index}")
return 0

except SiteGenerationError as exc:
print("✖ Site generation failed:")
for error in exc.errors:
print(f" - {error}")
return 1
except FileNotFoundError as exc:
print(f"✖ {exc}")
return 1
except Exception as exc: # pragma: no cover - unexpected edge cases
print(f"✖ Unexpected error: {exc}")
return 1


def main(argv: Optional[List[str]] = None) -> None:
argv = argv or sys.argv
if len(argv) > 1 and argv[1] in {"generate-site", "generate_site"}:
exit_code = _run_generate_site(argv[2:])
sys.exit(exit_code)

# Fall back to the legacy downloader behaviour.
downloader_main(argv)


if __name__ == "__main__":
main(sys.argv)
main(sys.argv)
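
A minimal programmatic sketch of the same flow the new `generate-site` subcommand wires up, assuming the `thinkific_downloader.site_generator` API matches the imports above (`generate_site`, `SiteGenerationError`); the metadata path is a placeholder:

```python
from pathlib import Path

from thinkific_downloader.site_generator import SiteGenerationError, generate_site

# Placeholder path; point this at your downloaded course metadata JSON.
metadata = Path("beginner-course.json").expanduser()

try:
    # Roughly equivalent to: python -m thinkific_downloader generate-site beginner-course.json --clean
    index_path = generate_site(
        metadata,
        downloads_root=None,      # default: <metadata>/../downloads
        output_dir=None,          # default: downloads/<course-slug>/
        clean=True,
        assets_dirname="site-assets",
    )
    print(f"Offline course generated: {index_path}")
except SiteGenerationError as exc:
    for error in exc.errors:
        print(f" - {error}")
```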
5 changes: 4 additions & 1 deletion thinkific_downloader/config.py
@@ -37,6 +37,7 @@ class Settings:
resume_partial: bool = True
debug: bool = False
course_name: str = "Course"
subtitle_download_enabled: bool = True

@classmethod
def from_env(cls):
@@ -67,6 +68,7 @@ def from_env(cls):
validate_downloads = os.getenv('VALIDATE_DOWNLOADS', 'true').lower() in ('1', 'true', 'yes', 'on')
resume_partial = os.getenv('RESUME_PARTIAL', 'true').lower() in ('1', 'true', 'yes', 'on')
debug = os.getenv('DEBUG', 'false').lower() in ('1', 'true', 'yes', 'on')
subtitle_download_enabled = os.getenv('SUBTITLE_DOWNLOAD_ENABLED', 'true').lower() in ('1', 'true', 'yes', 'on')

# Clean cookie data to remove Unicode characters that cause encoding issues
if cookie_data:
@@ -101,5 +103,6 @@ def from_env(cls):
download_delay=download_delay,
validate_downloads=validate_downloads,
resume_partial=resume_partial,
debug=debug
debug=debug,
subtitle_download_enabled=subtitle_download_enabled
)
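
The new flag follows the same truthy-string convention as the other boolean settings parsed in `from_env`. A standalone sketch of how that expression evaluates, independent of the rest of `Settings`:

```python
import os

# Same parsing rule as Settings.from_env(): only "1", "true", "yes", "on" (case-insensitive) enable it.
for raw in ("1", "true", "YES", "on", "false", "0", ""):
    os.environ["SUBTITLE_DOWNLOAD_ENABLED"] = raw
    enabled = os.getenv("SUBTITLE_DOWNLOAD_ENABLED", "true").lower() in ("1", "true", "yes", "on")
    print(f"SUBTITLE_DOWNLOAD_ENABLED={raw!r} -> {enabled}")
```

Note the asymmetry: an unset variable falls back to the `'true'` default, while an explicitly empty string parses as disabled.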
100 changes: 83 additions & 17 deletions thinkific_downloader/downloader.py
@@ -382,10 +382,74 @@ def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1):
add_download_task(src_url, dst_path, "file")


def _load_cached_progress(cache_file: Path):
"""Return previously analyzed chapters and queued tasks from the resume cache."""
analyzed_chapters = set()
saved_tasks: List[Dict[str, Any]] = []

if not cache_file.exists():
return analyzed_chapters, saved_tasks

try:
with open(cache_file, 'r', encoding='utf-8') as f:
cache_data = json.load(f)

analyzed_chapters = set(cache_data.get('analyzed_chapters', []))
saved_tasks = cache_data.get('download_tasks', [])
print(f"📋 Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached")

# If subtitle downloads were newly enabled, invalidate cache so we can regenerate tasks.
if SETTINGS and SETTINGS.subtitle_download_enabled and saved_tasks:
has_subtitle_tasks = any(
(task.get('content_type') or '').lower() == 'subtitle'
for task in saved_tasks
)
if not has_subtitle_tasks:
print("🆕 Subtitle support enabled — refreshing cached analysis to include captions.")
analyzed_chapters = set()
saved_tasks = []
try:
cache_file.unlink()
except OSError as exc:
print(f" ⚠️ Warning: Failed to delete cache file for refresh: {exc}")
except (json.JSONDecodeError, OSError):
analyzed_chapters = set()
saved_tasks = []

return analyzed_chapters, saved_tasks


def _restore_saved_tasks(saved_tasks: List[Dict[str, Any]]):
"""Restore cached download tasks, respecting the subtitle feature flag."""
if not saved_tasks:
return

restored_tasks = list(saved_tasks)
if SETTINGS and not SETTINGS.subtitle_download_enabled:
total_tasks = len(restored_tasks)
restored_tasks = [
task for task in restored_tasks
if (task.get('content_type') or 'video').lower() != 'subtitle'
]
skipped_count = total_tasks - len(restored_tasks)
if skipped_count > 0:
print(f"⏭️ Skipping {skipped_count} cached subtitle task(s) because subtitle downloads are disabled.")

if not restored_tasks:
return

print(f"📥 Restoring {len(restored_tasks)} previously collected download tasks...")
for task_data in restored_tasks:
add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video'))



def init_course(data: Dict[str, Any]):
"""Initialize course structure and collect ALL download tasks first."""
global COURSE_CONTENTS, ROOT_PROJECT_DIR, BASE_HOST, DOWNLOAD_TASKS

# Ensure settings/download manager are initialized so feature flags are available
init_settings()

# Initialize download tasks list
DOWNLOAD_TASKS = []
@@ -409,17 +473,7 @@ def init_course(data: Dict[str, Any]):
analyzed_chapters = set()
saved_tasks = []

if cache_file.exists():
try:
import json
with open(cache_file, 'r', encoding='utf-8') as f:
cache_data = json.load(f)
analyzed_chapters = set(cache_data.get('analyzed_chapters', []))
saved_tasks = cache_data.get('download_tasks', [])
print(f"📋 Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached")
except:
analyzed_chapters = set()
saved_tasks = []
analyzed_chapters, saved_tasks = _load_cached_progress(cache_file)

# Derive base host from landing_page_url if available
landing = data['course'].get('landing_page_url')
@@ -430,10 +484,7 @@ def init_course(data: Dict[str, Any]):
print("\n🔍 Phase 1: Analyzing course content and collecting download links...")

# Restore saved download tasks
if saved_tasks:
print(f"📥 Restoring {len(saved_tasks)} previously collected download tasks...")
for task_data in saved_tasks:
add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video'))
_restore_saved_tasks(saved_tasks)

collect_all_download_tasks(data, analyzed_chapters, cache_file)

@@ -835,9 +886,24 @@ def collect_video_task_wistia(wistia_id: str, file_name: str, dest_dir: Path):
video_url = selected.get('url')
if video_url:
ext = '.mp4' # Default extension
resolved_name = filter_filename(file_name) + ext
resolved_name = filter_filename(file_name)
if not resolved_name.lower().endswith(ext):
resolved_name += ext
print(f" 📹 Found video: {resolved_name}")
add_download_task(video_url, dest_dir / resolved_name, "video")
try:
from .wistia_downloader import build_wistia_subtitle_tasks
subtitle_tasks = build_wistia_subtitle_tasks(
data.get('media') or {},
dest_dir,
resolved_name,
SETTINGS,
)
for task in subtitle_tasks:
print(f" [Subs] Queued subtitles: {Path(task['dest_path']).name}")
add_download_task(task['url'], Path(task['dest_path']), task.get('content_type', 'subtitle'))
except Exception as subtitle_error:
print(f" ⚠️ Unable to queue subtitles for {resolved_name}: {subtitle_error}")
except Exception as e:
print(f" ❌ Failed to collect Wistia video {wistia_id}: {e}")

@@ -1282,4 +1348,4 @@ def main(argv: List[str]):


if __name__ == '__main__':
main(sys.argv)
main(sys.argv)
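
For reference, `_load_cached_progress` reads `analyzed_chapters` and `download_tasks` from the resume cache, and both `_restore_saved_tasks` and the new Wistia subtitle loop consume task dicts keyed by `url`, `dest_path`, and `content_type`. A hypothetical cache payload inferred from those reads (values are illustrative, not from a real run):

```python
# Hypothetical resume-cache contents; keys mirror what _load_cached_progress() reads
# and what add_download_task() is fed when cached tasks are restored.
cache_data = {
    "analyzed_chapters": ["chapter-101", "chapter-102"],  # illustrative identifiers
    "download_tasks": [
        {
            "url": "https://example.com/lesson.mp4",
            "dest_path": "downloads/course/01-intro/lesson.mp4",
            "content_type": "video",
        },
        {
            "url": "https://example.com/lesson.en.vtt",
            "dest_path": "downloads/course/01-intro/lesson.en.vtt",
            # The presence of a task of this type is what keeps the cache from being
            # invalidated when SUBTITLE_DOWNLOAD_ENABLED is on.
            "content_type": "subtitle",
        },
    ],
}
```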