-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtts.py
151 lines (121 loc) · 5.1 KB
/
tts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from TTS.api import TTS
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
import numpy as np
import os
import tempfile
import shutil
import soundfile as sf
from scipy import signal
def ensure_temp_dir():
    """Create (if needed) and return the directory for temporary audio files.

    The directory is ``temp_audio`` inside the current working directory.

    Returns:
        str: Path of the temporary directory.

    Raises:
        OSError: If the directory cannot be created, e.g. a regular file
            named ``temp_audio`` already exists at that location.
    """
    temp_dir = os.path.join(os.getcwd(), 'temp_audio')
    # exist_ok=True makes the call idempotent and removes the window between
    # a separate existence check and the creation.
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir
def cleanup_temp_dir(temp_dir):
    """Delete the temporary directory and everything inside it.

    This is best-effort: a failure is reported on stdout instead of being
    raised, so it never interrupts the caller.

    Args:
        temp_dir: Path of the directory to remove.
    """
    try:
        shutil.rmtree(temp_dir)
        print("Geçici dosyalar temizlendi")
    except Exception as err:
        failure_note = f"Geçici dosyaları temizlerken hata: {err!s}"
        print(failure_note)
def calculate_speech_speed(text, scene_duration):
    """Return the speed-up factor needed to fit narration into a scene.

    The spoken length of ``text`` is estimated from its word count at a
    fixed speaking rate and compared with 90% of the scene length.

    Args:
        text: Narration text; words are counted by splitting on whitespace.
        scene_duration: Length of the scene in seconds.

    Returns:
        float: ``1.0`` when the estimated narration already fits (or when
        ``text`` contains no words), otherwise the ratio of estimated
        narration length to the target length, which is greater than 1.0.

    Raises:
        ValueError: If ``text`` has words but ``scene_duration`` is not
            positive, since no finite speed-up can fit it.
    """
    WORDS_PER_SECOND = 2.5  # assumed speaking rate used for the estimate
    FILL_RATIO = 0.9  # narration may occupy at most this share of the scene

    word_count = len(text.split())
    if word_count == 0:
        # Nothing to say, so nothing to speed up.
        return 1.0

    if scene_duration <= 0:
        # Previously this surfaced as a bare ZeroDivisionError (duration 0)
        # or as a meaningless negative factor (negative duration).
        raise ValueError(
            f"scene_duration must be positive, got {scene_duration!r}"
        )

    normal_duration = word_count / WORDS_PER_SECOND
    target_duration = scene_duration * FILL_RATIO
    if normal_duration > target_duration:
        return normal_duration / target_duration
    return 1.0
def speed_up_audio_file(input_file, output_file, speed_factor):
    """Write a sped-up copy of an audio file and return the path to use.

    The samples are resampled down to ``len / speed_factor`` frames and
    written with the original sample rate, so the result plays back
    ``speed_factor`` times faster. Because the audio is simply compressed in
    time, its pitch rises along with the speed.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path where the sped-up audio is written.
        speed_factor: Desired speed multiplier. Values ``<= 1.0`` mean no
            change is needed.

    Returns:
        ``input_file`` when nothing was written (``speed_factor <= 1.0`` or
        the file holds no samples), otherwise ``output_file``.
    """
    if speed_factor <= 1.0:
        return input_file

    # Read audio file
    data, samplerate = sf.read(input_file)
    if len(data) == 0:
        # No samples to resample; hand back the untouched input.
        return input_file

    # Keep at least one frame so very short clips or large factors do not
    # request a zero-length resample.
    new_length = max(1, int(len(data) / speed_factor))

    # Resampling along axis 0 (frames) handles mono (1-D) data and any number
    # of channels (2-D, frames x channels) without hard-coding stereo.
    resampled = signal.resample(data, new_length, axis=0)

    # Save new audio file
    sf.write(output_file, resampled, samplerate)
    return output_file
def create_video_with_tts(video_path, scene_descriptions, scene_times, output_path=None, lang='en'):
    """Create a copy of a video whose audio track is TTS narration.

    For every scene description a speech clip is synthesized, sped up when
    the estimated narration is too long for the scene, placed at the scene's
    start time and cut off at the scene's end. The source video's own audio
    is dropped.

    Args:
        video_path: Path of the source video.
        scene_descriptions: Narration text per scene, in scene order.
        scene_times: Start time of each scene, index-aligned with
            ``scene_descriptions``. A scene ends where the next one starts;
            the last one ends with the video.
        output_path: Destination file. Defaults to
            ``<video_path without extension>_with_tts.mp4``.
        lang: ``'tr'`` or ``'en'``; any other value falls back to the
            English model.

    Returns:
        str: Path of the written video.

    Raises:
        Exception: ``"Error creating TTS video: ..."`` wrapping any failure,
            including the case where no scene produced audio. A failure in a
            single scene is only printed and that scene is skipped.
    """
    # Bind everything the cleanup code touches up front, so a failure at any
    # point (even inside ensure_temp_dir) cannot raise UnboundLocalError there
    # and hide the real error.
    temp_dir = None
    video = None
    final_video = None
    audio_clips = []

    try:
        print("Creating voiced video...")
        temp_dir = ensure_temp_dir()

        # TTS models for English and Turkish only
        model_map = {
            'tr': "tts_models/tr/common-voice/glow-tts",
            'en': "tts_models/en/ljspeech/glow-tts"
        }
        model_name = model_map.get(lang, model_map['en'])
        tts = TTS(model_name=model_name, progress_bar=False)
        print(f'Loaded TTS model: {model_name}')

        # Load main video without its original audio
        video = VideoFileClip(video_path).set_audio(None)

        for i, description in enumerate(scene_descriptions):
            try:
                # Calculate scene duration
                current_time = scene_times[i]
                next_time = scene_times[i + 1] if i < len(scene_times) - 1 else video.duration
                scene_duration = next_time - current_time

                # Generate TTS audio
                temp_audio = os.path.join(temp_dir, f'scene_{i}.wav')
                tts.tts_to_file(
                    text=description,
                    file_path=temp_audio
                )

                # Calculate and adjust speech speed
                speed = calculate_speech_speed(description, scene_duration)
                if speed > 1.0:
                    fast_audio = os.path.join(temp_dir, f'scene_{i}_fast.wav')
                    temp_audio = speed_up_audio_file(temp_audio, fast_audio, speed)

                # Create and position audio clip
                audio_clip = AudioFileClip(temp_audio)
                audio_clip = audio_clip.set_start(current_time)

                # The speed estimate is word-count based, so the real audio
                # can still overrun the scene; cut it at the scene boundary.
                if audio_clip.duration > scene_duration:
                    audio_clip = audio_clip.set_duration(scene_duration)

                audio_clips.append(audio_clip)
                print(f"Scene {i+1} audio created (duration: {scene_duration:.1f}s, speed: {speed:.1f}x)")
            except Exception as e:
                # One bad scene must not abort the whole video.
                print(f"Error creating audio for scene {i+1}: {str(e)}")
                continue

        if not audio_clips:
            raise Exception("No audio clips were created!")

        # Combine audio and add to video
        final_audio = CompositeAudioClip(audio_clips)
        final_video = video.set_audio(final_audio)

        if output_path is None:
            base_name = os.path.splitext(video_path)[0]
            output_path = f"{base_name}_with_tts.mp4"

        # Save video
        final_video.write_videofile(
            output_path,
            codec='libx264',
            audio_codec='aac',
            fps=video.fps
        )
        print(f"Video with TTS saved: {output_path}")
        return output_path
    except Exception as e:
        raise Exception(f"Error creating TTS video: {str(e)}") from e
    finally:
        # Release clips on success AND failure, and do it before deleting the
        # temp directory: files that are still open can block their removal.
        for clip in (video, final_video, *audio_clips):
            if clip is None:
                continue
            try:
                clip.close()
            except Exception as close_error:
                print(f"Error closing clip: {close_error}")
        if temp_dir:
            cleanup_temp_dir(temp_dir)