Skip to content

Commit

Permalink
lang options for captions downloader
Browse files: browse the repository at this point in the history
  • Loading branch information
nbonamy committed Jan 21, 2024
1 parent 4e5b69e commit 5d5939b
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number | Diff line number | Diff line change
Expand Up @@ -163,7 +163,7 @@ cython_debug/
.DS_Store

# our stuff
captions/
captions*/
db/
*.json
*.conf
Expand Down
6 changes: 5 additions & 1 deletion src/download_captions.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
import os
import sys
import json
import html
from downloader import Downloader
Expand All @@ -11,6 +12,9 @@ def main():
if not os.path.exists('captions'):
os.mkdir('captions')

# lang
lang = None if len(sys.argv) == 1 else sys.argv[1]

videos = json.load(open('videos.json'))
for video in videos:

Expand All @@ -26,7 +30,7 @@ def main():
original = open(f'captions/{id}.original.vtt', 'r').read()
else:
print(f'[youtube] downloading captions for {id}: {title}...')
original = downloader.download_captions(id)
original = downloader.download_captions(id, lang)
with open(f'captions/{id}.original.vtt', 'w') as f:
f.write(original)

Expand Down
2 changes: 1 addition & 1 deletion src/downloader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -59,7 +59,7 @@ def _cleanup_captions(self, original_captions):
# remove header lines
contents = re.sub(r'WEBVTT\n', '', contents)
contents = re.sub(r'Kind: captions\n', '', contents)
contents = re.sub(r'Language: en\n', '', contents)
contents = re.sub(r'Language: .*?\n', '', contents)

# remove timestamp lines
contents = re.sub(r'\d\d:\d\d:\d\d\.\d\d\d --> .*\n', '', contents)
Expand Down

0 comments on commit 5d5939b

Please sign in to comment.