-
Notifications
You must be signed in to change notification settings - Fork 131
/
Copy pathedge_tts_provider.py
186 lines (155 loc) · 6.44 KB
/
edge_tts_provider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import asyncio
import logging
import math
import io
import edge_tts
from edge_tts import list_voices
from typing import Union
from pydub import AudioSegment
from audiobook_generator.config.general_config import GeneralConfig
from audiobook_generator.core.audio_tags import AudioTags
from audiobook_generator.core.utils import set_audio_tags
from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider
logger = logging.getLogger(__name__)
async def get_supported_voices():
# List all available voices and their attributes.
# This pulls data from the URL used by Microsoft Edge to return a list of
# all available voices.
# Returns:
# dict: A dictionary of voice attributes.
voices = await list_voices()
voices = sorted(voices, key=lambda voice: voice["ShortName"])
result = {}
for voice in voices:
result[voice["ShortName"]] = voice["Locale"]
return result
# Credit: https://gist.github.com/moha-abdi/8ddbcb206c38f592c65ada1e5479f2bf
# @phuchoang2603 contributed pause support in https://github.com/p0n1/epub_to_audiobook/pull/45
class CommWithPauses:
# This class uses edge_tts to generate text
# but with pauses for example:- text: 'Hello
# this is simple text. [pause: 1000] Paused 1000ms'
def __init__(
self,
text: str,
voice_name: str,
break_string: str,
break_duration: int = 1250,
**kwargs,
) -> None:
self.full_text = text
self.voice_name = voice_name
self.break_string = break_string
self.break_duration = int(break_duration)
self.parsed = self.parse_text()
self.file = io.BytesIO()
def parse_text(self):
logger.debug(
f"Parsing the text, looking for break/pauses in text: <{self.full_text}>"
)
if self.break_string not in self.full_text:
logger.debug(f"No break/pauses found in the text")
return [self.full_text]
parts = self.full_text.split(self.break_string)
logger.debug(f"split into <{len(parts)}> parts: {parts}")
return parts
async def chunkify(self):
logger.debug(f"Chunkifying the text")
for content in self.parsed:
logger.debug(f"content from parsed: <{content}>")
audio_bytes = await self.generate_audio(content)
self.file.write(audio_bytes)
if content != self.parsed[-1] and self.break_duration > 0:
# only same break duration for all breaks is supported now
pause_bytes = self.generate_pause(self.break_duration)
self.file.write(pause_bytes)
logger.debug(f"Chunkifying done")
def generate_pause(self, time: int) -> bytes:
logger.debug(f"Generating pause")
# pause time should be provided in ms
silent: AudioSegment = AudioSegment.silent(time, 24000)
return silent.raw_data # type: ignore
async def generate_audio(self, text: str) -> bytes:
logger.debug(f"Generating audio for: <{text}>")
# this genertes the real TTS using edge_tts for this part.
temp_chunk = io.BytesIO()
communicate = edge_tts.Communicate(text, self.voice_name)
async for chunk in communicate.stream():
if chunk["type"] == "audio":
temp_chunk.write(chunk["data"])
temp_chunk.seek(0)
# handle the case where the chunk is empty
try:
logger.debug(f"Decoding the chunk")
decoded_chunk = AudioSegment.from_mp3(temp_chunk)
except Exception as e:
logger.warning(
f"Failed to decode the chunk, reason: {e}, returning a silent chunk."
)
decoded_chunk = AudioSegment.silent(0, 24000)
logger.debug(f"Returning the decoded chunk")
return decoded_chunk.raw_data # type: ignore
async def save(
self,
audio_fname: Union[str, bytes],
) -> None:
await self.chunkify()
self.file.seek(0)
audio: AudioSegment = AudioSegment.from_raw(
self.file, sample_width=2, frame_rate=24000, channels=1
)
logger.debug(f"Exporting the audio")
audio.export(audio_fname)
logger.info(f"Saved the audio to: {audio_fname}")
class EdgeTTSProvider(BaseTTSProvider):
def __init__(self, config: GeneralConfig):
# TTS provider specific config
config.voice_name = config.voice_name or "en-US-GuyNeural"
config.output_format = config.output_format or "audio-24khz-48kbitrate-mono-mp3"
config.voice_rate = config.voice_rate or "+0%"
config.voice_volume = config.voice_volume or "+0%"
config.voice_pitch = config.voice_pitch or "+0Hz"
config.proxy = config.proxy or None
# 0.000$ per 1 million characters
# or 0.000$ per 1000 characters
self.price = 0.000
super().__init__(config)
def __str__(self) -> str:
return f"{self.config}"
def validate_config(self):
supported_voices = asyncio.run(get_supported_voices())
# logger.debug(f"Supported voices: {supported_voices}")
if self.config.voice_name not in supported_voices:
raise ValueError(
f"EdgeTTS: Unsupported voice name: {self.config.voice_name}"
)
def text_to_speech(
self,
text: str,
output_file: str,
audio_tags: AudioTags,
):
communicate = CommWithPauses(
text=text,
voice_name=self.config.voice_name,
break_string=self.get_break_string().strip(),
break_duration=int(self.config.break_duration),
rate=self.config.voice_rate,
volume=self.config.voice_volume,
pitch=self.config.voice_pitch,
proxy=self.config.proxy,
)
asyncio.run(communicate.save(output_file))
set_audio_tags(output_file, audio_tags)
def estimate_cost(self, total_chars):
return math.ceil(total_chars / 1000) * self.price
def get_break_string(self):
return " @BRK#"
def get_output_file_extension(self):
if self.config.output_format.endswith("mp3"):
return "mp3"
else:
# Only mp3 supported in edge-tts https://github.com/rany2/edge-tts/issues/179
raise NotImplementedError(
f"Unknown file extension for output format: {self.config.output_format}. Only mp3 supported in edge-tts. See https://github.com/rany2/edge-tts/issues/179."
)