Skip to content

Commit

Permalink
CLI Options; Console Log and Logging; Message file
Browse files Browse the repository at this point in the history
  • Loading branch information
MatteoFasulo committed Aug 26, 2023
1 parent 62a0b7d commit 0a3d8ea
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 39 deletions.
56 changes: 56 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,62 @@ To embark on your Whisper-TikTok journey, initiate the following command within
python main.py
```

## Command-Line Options

Whisper-TikTok supports the following command-line options:

```bash
python main.py [OPTIONS]

Options:
--model TEXT Model to use
[tiny|base|small|medium|large] (Default: small)
--non_english Don't use the English model. (Flag)
--url TEXT YouTube URL to download as background video.
(Default: <https://www.youtube.com/watch?v=intRX7BRA90>)
--tts TEXT Voice to use for TTS (Default: en-US-ChristopherNeural)
--list-voices Use `edge-tts --list-voices` to list all voices.
--random_voice Random voice for TTS (Flag)
--gender TEXT Gender of the random TTS voice [Male|Female].
--language TEXT Language of the random TTS voice
(e.g., en-US)
-v, --verbose Verbose (Flag)
```
> If you use the `--random_voice` option, you must specify both the `--gender` and `--language` arguments. You will also need to pass `--non_english` if you want a non-English voice; otherwise the program will use the English model. The Whisper model auto-detects the language of the audio file and uses the corresponding model.
## Usage Examples
- Generate a TikTok video using a specific TTS model and voice:
```bash
python main.py --model medium --tts en-US-EricNeural
```
- Generate a TikTok video without using the English model:
```bash
python main.py --non_english --tts de-DE-KillianNeural
```
- Use a custom YouTube video as the background video:
```bash
python main.py --url https://www.youtube.com/watch?v=dQw4w9WgXcQ --tts en-US-JennyNeural
```
- Generate a TikTok video with a random TTS voice:
```bash
python main.py --random_voice --gender Male --language en-US
```
- List all available voices:
```bash
edge-tts --list-voices
```
## Code of Conduct
Please review our [Code of Conduct](./CODE_OF_CONDUCT.md) before contributing to Whisper-TikTok.
Expand Down
136 changes: 97 additions & 39 deletions code/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import logging
from typing import Tuple
import datetime
import argparse

# PyTorch
import torch
Expand All @@ -31,6 +32,9 @@
# utils.py
from utils import *

# msg.py
import msg

HOME = os.getcwd()

# Logging
Expand Down Expand Up @@ -63,30 +67,77 @@


async def main() -> bool:
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="small", help="Model to use",
choices=["tiny", "base", "small", "medium", "large"], type=str)
parser.add_argument("--non_english", action='store_true',
help="Don't use the english model.")
parser.add_argument("--url", metavar='U', default="https://www.youtube.com/watch?v=intRX7BRA90",
help="Youtube URL to download as background video.", type=str)
parser.add_argument("--tts", default="en-US-ChristopherNeural",
help="Voice to use for TTS", type=str)
parser.add_argument(
"--list-voices", help="Use `edge-tts --list-voices` to list all voices", action='help')
parser.add_argument("--random_voice", action='store_true',
help="Random voice for TTS", default=False)
parser.add_argument("--gender", choices=["Male", "Female"],
help="Gender of the random TTS voice", type=str)
parser.add_argument(
"--language", help="Language of the random TTS voice for example: en-US", type=str)
parser.add_argument("-v", "--verbose", action='store_true',
help="Verbose")
args = parser.parse_args()

if args.random_voice:
args.tts = None
if not args.gender or not args.language:
console.log(
f"{msg.ERROR}When using --random_voice, please specify both --gender and --language arguments.")
sys.exit(1)

else:
voices = await VoicesManager.create()
voices = voices.find(Gender=args.gender, Locale=args.language)
if len(voices) == 0:
# Locale not found
console.log(
f"{msg.ERROR}Specified TTS language not found. Make sure you are using the correct format. For example: en-US")
sys.exit(1)

# Check if language is english
if not str(args.language).startswith('en'):
args.non_english = True

# Clear terminal
console.clear()

logging.debug('Creating video')
with console.status("[bold cyan]Creating video...") as status:
logger.debug('Creating video')
with console.status(msg.STATUS) as status:
load_dotenv(find_dotenv()) # Optional

console.log(
f"| [green]OK[/green] | Finish loading environment variables")
logging.info('Finish loading environment variables')
f"{msg.OK}Finish loading environment variables")
logger.info('Finish loading environment variables')

# Check if GPU is available for PyTorch (CUDA).
if torch.cuda.is_available():
console.log(f"| [green]OK[/green] | PyTorch GPU version found")
logging.info('PyTorch GPU version found')
console.log(f"{msg.OK}PyTorch GPU version found")
logger.info('PyTorch GPU version found')
else:
console.log(
f"| [yellow][WARNING][/yellow] | PyTorch GPU not found, using CPU instead")
logging.warning('PyTorch GPU not found')
f"{msg.WARNING}PyTorch GPU not found, using CPU instead")
logger.warning('PyTorch GPU not found')

download_video(url='https://www.youtube.com/watch?v=intRX7BRA90')
download_video(url=args.url)

model = whisper.load_model("small.en")
console.log(f"| [green]OK[/green] | OpenAI-Whisper model loaded")
logging.info('OpenAI-Whisper model loaded')
# OpenAI-Whisper Model
model = args.model
if args.model != "large" and not args.non_english:
model = args.model + ".en"
whisper_model = whisper.load_model(model)

console.log(f"{msg.OK}OpenAI-Whisper model loaded")
logger.info('OpenAI-Whisper model loaded')

# Text 2 Speech (Edge TTS API)
for video_id, video in enumerate(jsonData):
Expand All @@ -99,34 +150,34 @@ async def main() -> bool:
req_text, filename = create_full_text(
path, series, part, text, outro)

console.log(f"| [green]OK[/green] | Text converted successfully")
logging.info('Text converted successfully')
console.log(f"{msg.OK}Text converted successfully")
logger.info('Text converted successfully')

await tts(req_text, outfile=filename)
await tts(req_text, outfile=filename, voice=args.tts, random_voice=args.random_voice, args=args)

console.log(
f"| [green]OK[/green] | Text2Speech mp3 file generated successfully!")
logging.info('Text2Speech mp3 file generated successfully!')
f"{msg.OK}Text2Speech mp3 file generated successfully!")
logger.info('Text2Speech mp3 file generated successfully!')

# Whisper Model to create SRT file from Speech recording
srt_filename = srt_create(
model, path, series, part, text, filename)
whisper_model, path, series, part, text, filename)

console.log(
f"| [green]OK[/green] | Transcription srt and ass file saved successfully!")
logging.info('Transcription srt and ass file saved successfully!')
f"{msg.OK}Transcription srt and ass file saved successfully!")
logger.info('Transcription srt and ass file saved successfully!')

# Background video with srt and duration
background_mp4 = random_background()
file_info = get_info(background_mp4)
file_info = get_info(background_mp4, verbose=args.verbose)
final_video = prepare_background(
background_mp4, filename_mp3=filename, filename_srt=srt_filename, duration=int(file_info.get('duration')))
background_mp4, filename_mp3=filename, filename_srt=srt_filename, duration=int(file_info.get('duration')), verbose=args.verbose)

console.log(
f"| [green]OK[/green] | MP4 video saved successfully!\nPath: {final_video}")
logging.info(f'MP4 video saved successfully!\nPath: {final_video}')
f"{msg.OK}MP4 video saved successfully!\nPath: {final_video}")
logger.info(f'MP4 video saved successfully!\nPath: {final_video}')

console.log(f'[bold][red]Done![/red][/bold]')
console.log(f'{msg.DONE}')
return True


Expand All @@ -138,8 +189,8 @@ def download_video(url: str, folder: str = 'background'):
with subprocess.Popen(['yt-dlp', '--restrict-filenames', '--merge-output-format', 'mp4', url]) as process:
pass
console.log(
f"| [green]OK[/green] | Background video downloaded successfully")
logging.info('Background video downloaded successfully')
f"{msg.OK}Background video downloaded successfully")
logger.info('Background video downloaded successfully')
return


Expand All @@ -165,24 +216,24 @@ def get_info(filename: str, verbose: bool = False):
except Exception:
if verbose:
console.log(
f"| [yellow][WARNING][/yellow] | MP4 default metadata not found")
logging.warning('MP4 default metadata not found')
f"{msg.WARNING}MP4 default metadata not found")
logger.warning('MP4 default metadata not found')
duration = (datetime.datetime.strptime(
audio_stream['DURATION'], '%H:%M:%S.%f') - datetime.datetime.min).total_seconds()
if video_stream is None:
if verbose:
console.log(
f"| [yellow][WARNING][/yellow] | No video stream found")
logging.warning('No video stream found')
f"{msg.WARNING}No video stream found")
logger.warning('No video stream found')
bit_rate = int(audio_stream['bit_rate'])
return {'bit_rate': bit_rate, 'duration': duration}

width = int(video_stream['width'])
height = int(video_stream['height'])
return {'width': width, 'height': height, 'duration': duration}
except ffmpeg.Error as e:
rich_print(f"[ERROR] {e.stderr}", style="bold red")
logging.critical(e.stderr)
console.log(f"{msg.ERROR}{e.stderr}")
logger.exception(e.stderr)
sys.exit(1)


Expand Down Expand Up @@ -242,7 +293,8 @@ def srt_create(model, path: str, series: str, part: int, text: str, filename: st
bool: A boolean indicating whether the creation of the .srt file was successful or not.
"""
transcribe = model.transcribe(filename, regroup=True)
transcribe = model.transcribe(
filename, regroup=True, fp16=torch.cuda.is_available())
transcribe.split_by_gap(0.5).split_by_length(
38).merge_by_gap(0.15, max_words=2)
series = series.replace(' ', '_')
Expand Down Expand Up @@ -329,7 +381,7 @@ def create_full_text(path: str = '', series: str = '', part: int = 1, text: str
return req_text, filename


async def tts(final_text: str, voice: str = "en-US-ChristopherNeural", random_voice: bool = False, stdout: bool = False, outfile: str = "tts.mp3") -> bool:
async def tts(final_text: str, voice: str = "en-US-ChristopherNeural", random_voice: bool = False, stdout: bool = False, outfile: str = "tts.mp3", args=None) -> bool:
"""
Tts is an asynchronous function that takes in four arguments: a final text string, a voice string, a boolean value for random voice selection, a boolean value to indicate if output should be directed to standard output or not, and a filename string for the output file. The function uses Microsoft Azure Cognitive Services to synthesize speech from the input text using the specified voice, and saves the output to a file or prints it to the console.
Expand All @@ -346,23 +398,29 @@ async def tts(final_text: str, voice: str = "en-US-ChristopherNeural", random_vo
"""
voices = await VoicesManager.create()
if random_voice:
voices = voices.find(Gender="Male", Locale="en-US")
voices = voices.find(Gender=args.gender, Locale=args.language)
voice = random.choice(voices)["Name"]
communicate = edge_tts.Communicate(final_text, voice)
if not stdout:
await communicate.save(outfile)
return True

if __name__ == "__main__":

if platform.system() == 'Windows':
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

loop = asyncio.get_event_loop()

try:
loop.run_until_complete(main())

except Exception as e:
rich_print(
f"[ERROR] {type(e).__name__} at line {e.__traceback__.tb_lineno} of {__file__}: {e}", style="bold red")
loop.close()
sys.exit(1)
console.log(f"{msg.ERROR}{e}")
logger.exception(e)

finally:
loop.close()

sys.exit(1)
5 changes: 5 additions & 0 deletions code/msg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# msg.py — shared Rich console-markup fragments for CLI output.
# Each constant is a prefix (or banner) concatenated in front of a log
# message, e.g. console.log(f"{msg.OK}Finish loading environment variables").
STATUS = "[bold cyan]Creating video... | "  # spinner text passed to console.status()
OK = "| [green]OK[/green] | "  # prefix for success messages
WARNING = "| [yellow][WARNING][/yellow] | "  # prefix for warning messages
DONE = "[bold][red]Done![/red][/bold] "  # final completion banner
ERROR = "| [bold red]ERROR[/bold red] | "  # prefix for error messages

0 comments on commit 0a3d8ea

Please sign in to comment.