Skip to content

Commit

Permalink
CLI Options; Console Log and Logging; Message file
Browse files Browse the repository at this point in the history
  • Loading branch information
MatteoFasulo committed Aug 26, 2023
1 parent 62a0b7d commit 0a3d8ea
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 39 deletions.
56 changes: 56 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,62 @@ To embark on your Whisper-TikTok journey, initiate the following command within
python main.py
```

## Command-Line Options

Whisper-TikTok supports the following command-line options:

```bash
python main.py [OPTIONS]

Options:
--model TEXT Model to use
[tiny|base|small|medium|large] (Default: small)
--non_english Don't use the English model. (Flag)
--url TEXT YouTube URL to download as background video.
(Default: <https://www.youtube.com/watch?v=intRX7BRA90>)
--tts TEXT Voice to use for TTS (Default: en-US-ChristopherNeural)
--list-voices Use `edge-tts --list-voices` to list all voices.
--random_voice Random voice for TTS (Flag)
--gender TEXT Gender of the random TTS voice [Male|Female].
--language TEXT Language of the random TTS voice
(e.g., en-US)
-v, --verbose Verbose (Flag)
```
> If you use the `--random_voice` option, you must specify both the `--gender` and `--language` arguments. You will also need to pass `--non_english` if you want a non-English voice; otherwise the program will use the English model. The Whisper model auto-detects the language of the audio file and uses the corresponding model.
## Usage Examples
- Generate a TikTok video using a specific TTS model and voice:
```bash
python main.py --model medium --tts en-US-EricNeural
```
- Generate a TikTok video without using the English model:
```bash
python main.py --non_english --tts de-DE-KillianNeural
```
- Use a custom YouTube video as the background video:
```bash
python main.py --url https://www.youtube.com/watch?v=dQw4w9WgXcQ --tts en-US-JennyNeural
```
- Generate a TikTok video with a random TTS voice:
```bash
python main.py --random_voice --gender Male --language en-US
```
- List all available voices:
```bash
edge-tts --list-voices
```
## Code of Conduct
Please review our [Code of Conduct](./CODE_OF_CONDUCT.md) before contributing to Whisper-TikTok.
Expand Down
136 changes: 97 additions & 39 deletions code/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import logging
from typing import Tuple
import datetime
import argparse

# PyTorch
import torch
Expand All @@ -31,6 +32,9 @@
# utils.py
from utils import *

# msg.py
import msg

HOME = os.getcwd()

# Logging
Expand Down Expand Up @@ -63,30 +67,77 @@


async def main() -> bool:
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="small", help="Model to use",
choices=["tiny", "base", "small", "medium", "large"], type=str)
parser.add_argument("--non_english", action='store_true',
help="Don't use the english model.")
parser.add_argument("--url", metavar='U', default="https://www.youtube.com/watch?v=intRX7BRA90",
help="Youtube URL to download as background video.", type=str)
parser.add_argument("--tts", default="en-US-ChristopherNeural",
help="Voice to use for TTS", type=str)
parser.add_argument(
"--list-voices", help="Use `edge-tts --list-voices` to list all voices", action='help')
parser.add_argument("--random_voice", action='store_true',
help="Random voice for TTS", default=False)
parser.add_argument("--gender", choices=["Male", "Female"],
help="Gender of the random TTS voice", type=str)
parser.add_argument(
"--language", help="Language of the random TTS voice for example: en-US", type=str)
parser.add_argument("-v", "--verbose", action='store_true',
help="Verbose")
args = parser.parse_args()

if args.random_voice:
args.tts = None
if not args.gender or not args.language:
console.log(
f"{msg.ERROR}When using --random_voice, please specify both --gender and --language arguments.")
sys.exit(1)

else:
voices = await VoicesManager.create()
voices = voices.find(Gender=args.gender, Locale=args.language)
if len(voices) == 0:
# Locale not found
console.log(
f"{msg.ERROR}Specified TTS language not found. Make sure you are using the correct format. For example: en-US")
sys.exit(1)

# Check if language is english
if not str(args.language).startswith('en'):
args.non_english = True

# Clear terminal
console.clear()

logging.debug('Creating video')
with console.status("[bold cyan]Creating video...") as status:
logger.debug('Creating video')
with console.status(msg.STATUS) as status:
load_dotenv(find_dotenv()) # Optional

console.log(
f"| [green]OK[/green] | Finish loading environment variables")
logging.info('Finish loading environment variables')
f"{msg.OK}Finish loading environment variables")
logger.info('Finish loading environment variables')

# Check if GPU is available for PyTorch (CUDA).
if torch.cuda.is_available():
console.log(f"| [green]OK[/green] | PyTorch GPU version found")
logging.info('PyTorch GPU version found')
console.log(f"{msg.OK}PyTorch GPU version found")
logger.info('PyTorch GPU version found')
else:
console.log(
f"| [yellow][WARNING][/yellow] | PyTorch GPU not found, using CPU instead")
logging.warning('PyTorch GPU not found')
f"{msg.WARNING}PyTorch GPU not found, using CPU instead")
logger.warning('PyTorch GPU not found')

download_video(url='https://www.youtube.com/watch?v=intRX7BRA90')
download_video(url=args.url)

model = whisper.load_model("small.en")
console.log(f"| [green]OK[/green] | OpenAI-Whisper model loaded")
logging.info('OpenAI-Whisper model loaded')
# OpenAI-Whisper Model
model = args.model
if args.model != "large" and not args.non_english:
model = args.model + ".en"
whisper_model = whisper.load_model(model)

console.log(f"{msg.OK}OpenAI-Whisper model loaded")
logger.info('OpenAI-Whisper model loaded')

# Text 2 Speech (Edge TTS API)
for video_id, video in enumerate(jsonData):
Expand All @@ -99,34 +150,34 @@ async def main() -> bool:
req_text, filename = create_full_text(
path, series, part, text, outro)

console.log(f"| [green]OK[/green] | Text converted successfully")
logging.info('Text converted successfully')
console.log(f"{msg.OK}Text converted successfully")
logger.info('Text converted successfully')

await tts(req_text, outfile=filename)
await tts(req_text, outfile=filename, voice=args.tts, random_voice=args.random_voice, args=args)

console.log(
f"| [green]OK[/green] | Text2Speech mp3 file generated successfully!")
logging.info('Text2Speech mp3 file generated successfully!')
f"{msg.OK}Text2Speech mp3 file generated successfully!")
logger.info('Text2Speech mp3 file generated successfully!')

# Whisper Model to create SRT file from Speech recording
srt_filename = srt_create(
model, path, series, part, text, filename)
whisper_model, path, series, part, text, filename)

console.log(
f"| [green]OK[/green] | Transcription srt and ass file saved successfully!")
logging.info('Transcription srt and ass file saved successfully!')
f"{msg.OK}Transcription srt and ass file saved successfully!")
logger.info('Transcription srt and ass file saved successfully!')

# Background video with srt and duration
background_mp4 = random_background()
file_info = get_info(background_mp4)
file_info = get_info(background_mp4, verbose=args.verbose)
final_video = prepare_background(
background_mp4, filename_mp3=filename, filename_srt=srt_filename, duration=int(file_info.get('duration')))
background_mp4, filename_mp3=filename, filename_srt=srt_filename, duration=int(file_info.get('duration')), verbose=args.verbose)

console.log(
f"| [green]OK[/green] | MP4 video saved successfully!\nPath: {final_video}")
logging.info(f'MP4 video saved successfully!\nPath: {final_video}')
f"{msg.OK}MP4 video saved successfully!\nPath: {final_video}")
logger.info(f'MP4 video saved successfully!\nPath: {final_video}')

console.log(f'[bold][red]Done![/red][/bold]')
console.log(f'{msg.DONE}')
return True


Expand All @@ -138,8 +189,8 @@ def download_video(url: str, folder: str = 'background'):
with subprocess.Popen(['yt-dlp', '--restrict-filenames', '--merge-output-format', 'mp4', url]) as process:
pass
console.log(
f"| [green]OK[/green] | Background video downloaded successfully")
logging.info('Background video downloaded successfully')
f"{msg.OK}Background video downloaded successfully")
logger.info('Background video downloaded successfully')
return


Expand All @@ -165,24 +216,24 @@ def get_info(filename: str, verbose: bool = False):
except Exception:
if verbose:
console.log(
f"| [yellow][WARNING][/yellow] | MP4 default metadata not found")
logging.warning('MP4 default metadata not found')
f"{msg.WARNING}MP4 default metadata not found")
logger.warning('MP4 default metadata not found')
duration = (datetime.datetime.strptime(
audio_stream['DURATION'], '%H:%M:%S.%f') - datetime.datetime.min).total_seconds()
if video_stream is None:
if verbose:
console.log(
f"| [yellow][WARNING][/yellow] | No video stream found")
logging.warning('No video stream found')
f"{msg.WARNING}No video stream found")
logger.warning('No video stream found')
bit_rate = int(audio_stream['bit_rate'])
return {'bit_rate': bit_rate, 'duration': duration}

width = int(video_stream['width'])
height = int(video_stream['height'])
return {'width': width, 'height': height, 'duration': duration}
except ffmpeg.Error as e:
rich_print(f"[ERROR] {e.stderr}", style="bold red")
logging.critical(e.stderr)
console.log(f"{msg.ERROR}{e.stderr}")
logger.exception(e.stderr)
sys.exit(1)


Expand Down Expand Up @@ -242,7 +293,8 @@ def srt_create(model, path: str, series: str, part: int, text: str, filename: st
bool: A boolean indicating whether the creation of the .srt file was successful or not.
"""
transcribe = model.transcribe(filename, regroup=True)
transcribe = model.transcribe(
filename, regroup=True, fp16=torch.cuda.is_available())
transcribe.split_by_gap(0.5).split_by_length(
38).merge_by_gap(0.15, max_words=2)
series = series.replace(' ', '_')
Expand Down Expand Up @@ -329,7 +381,7 @@ def create_full_text(path: str = '', series: str = '', part: int = 1, text: str
return req_text, filename


async def tts(final_text: str, voice: str = "en-US-ChristopherNeural", random_voice: bool = False, stdout: bool = False, outfile: str = "tts.mp3") -> bool:
async def tts(final_text: str, voice: str = "en-US-ChristopherNeural", random_voice: bool = False, stdout: bool = False, outfile: str = "tts.mp3", args=None) -> bool:
"""
Tts is an asynchronous function that takes in four arguments: a final text string, a voice string, a boolean value for random voice selection, a boolean value to indicate if output should be directed to standard output or not, and a filename string for the output file. The function uses Microsoft Azure Cognitive Services to synthesize speech from the input text using the specified voice, and saves the output to a file or prints it to the console.
Expand All @@ -346,23 +398,29 @@ async def tts(final_text: str, voice: str = "en-US-ChristopherNeural", random_vo
"""
voices = await VoicesManager.create()
if random_voice:
voices = voices.find(Gender="Male", Locale="en-US")
voices = voices.find(Gender=args.gender, Locale=args.language)
voice = random.choice(voices)["Name"]
communicate = edge_tts.Communicate(final_text, voice)
if not stdout:
await communicate.save(outfile)
return True

if __name__ == "__main__":

if platform.system() == 'Windows':
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

loop = asyncio.get_event_loop()

try:
loop.run_until_complete(main())

except Exception as e:
rich_print(
f"[ERROR] {type(e).__name__} at line {e.__traceback__.tb_lineno} of {__file__}: {e}", style="bold red")
loop.close()
sys.exit(1)
console.log(f"{msg.ERROR}{e}")
logger.exception(e)

finally:
loop.close()

sys.exit(1)
5 changes: 5 additions & 0 deletions code/msg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# msg.py — shared Rich console-markup fragments for CLI output.
# Each constant is a prefix (or banner) concatenated in front of a log
# message, e.g. console.log(f"{msg.OK}Finish loading environment variables").
STATUS = "[bold cyan]Creating video... | "  # spinner text passed to console.status()
OK = "| [green]OK[/green] | "  # prefix for success messages
WARNING = "| [yellow][WARNING][/yellow] | "  # prefix for warning messages
DONE = "[bold][red]Done![/red][/bold] "  # final completion banner
ERROR = "| [bold red]ERROR[/bold red] | "  # prefix for error messages

0 comments on commit 0a3d8ea

Please sign in to comment.