Skip to content

Commit

Permalink
Merge pull request #71 from p0n1/fix-edge-pause
Browse files Browse the repository at this point in the history
Fix edge pause
  • Loading branch information
p0n1 authored Jun 28, 2024
2 parents d9f70c2 + d16d571 commit 99748a9
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 87 deletions.
27 changes: 18 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ python3 main.py -h
usage: main.py [-h] [--tts {azure,openai,edge}]
[--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--preview]
[--no_prompt] [--language LANGUAGE]
[--newline_mode {single,double}]
[--newline_mode {single,double,none}]
[--title_mode {auto,tag_text,first_few}]
[--chapter_start CHAPTER_START] [--chapter_end CHAPTER_END]
[--output_text] [--remove_endnotes] [--voice_name VOICE_NAME]
[--output_format OUTPUT_FORMAT] [--model_name MODEL_NAME]
Expand Down Expand Up @@ -126,13 +127,20 @@ options:
different strategies in this tool, especially for
Chinese characters. For Chinese books, use zh-CN, zh-
TW, or zh-HK.
--newline_mode {single,double}
Choose the mode of detecting new paragraphs: 'single'
or 'double'. 'single' means a single newline
--newline_mode {single,double,none}
Choose the mode of detecting new paragraphs: 'single',
'double', or 'none'. 'single' means a single newline
character, while 'double' means two consecutive
newline characters. (default: double, works for most
newline characters. 'none' means all newline
characters will be replaced with a space so paragraphs
will not be detected. (default: double, works for most
ebooks but will detect fewer paragraphs for some
ebooks)
--title_mode {auto,tag_text,first_few}
Choose the parse mode for the chapter title: 'tag_text'
searches the 'title', 'h1', 'h2', 'h3' tags for a title,
'first_few' uses the first 60 characters as the title,
'auto' automatically applies the best mode for the current chapter.
--chapter_start CHAPTER_START
Chapter start index (default: 1, starting from 1)
--chapter_end CHAPTER_END
Expand Down Expand Up @@ -170,11 +178,12 @@ edge specific:
--proxy PROXY Proxy server for the TTS provider. Format:
http://[username:password@]proxy.server:port

azure specific:
azure/edge specific:
--break_duration BREAK_DURATION
Break duration in milliseconds for the different
paragraphs or sections (default: 1250). Valid values
range from 0 to 5000 milliseconds.
paragraphs or sections (default: 1250, means 1.25 s).
Valid values range from 0 to 5000 milliseconds for
Azure TTS.
```
**Example**:
Expand Down Expand Up @@ -249,7 +258,7 @@ Check https://platform.openai.com/docs/quickstart/account-setup. Make sure you c
Edge TTS and Azure TTS are almost the same; the difference is that Edge TTS doesn't require an API key because it's based on the Edge read-aloud functionality, and some parameters are a bit restricted, such as [custom ssml](https://github.com/rany2/edge-tts#custom-ssml).
Check https://github.com/p0n1/epub_to_audiobook/blob/main/audiobook_generator/tts_providers/edge_tts_provider.py#L17 for supported voices.
Check https://gist.github.com/BettyJJ/17cbaa1de96235a7f5773b8690a20462 for supported voices.
**If you want to try this project quickly, Edge TTS is highly recommended.**
Expand Down
2 changes: 2 additions & 0 deletions audiobook_generator/book_parsers/epub_book_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
cleaned_text = re.sub(r"[\n]+", break_string, raw.strip())
elif self.config.newline_mode == "double":
cleaned_text = re.sub(r"[\n]{2,}", break_string, raw.strip())
elif self.config.newline_mode == "none":
cleaned_text = re.sub(r"[\n]+", " ", raw.strip())
else:
raise ValueError(f"Invalid newline mode: {self.config.newline_mode}")

Expand Down
2 changes: 1 addition & 1 deletion audiobook_generator/config/general_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(self, args):
self.output_format = args.output_format
self.model_name = args.model_name

# TTS provider: Azure specific arguments
# TTS provider: Azure & Edge TTS specific arguments
self.break_duration = args.break_duration

# TTS provider: Edge specific arguments
Expand Down
5 changes: 5 additions & 0 deletions audiobook_generator/core/audiobook_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ def run(self):
output_file,
audio_tags,
)
logger.info(
f"✅ Converted chapter {idx}/{len(chapters)}: {title}"
)
logger.info(f"All chapters converted. 🎉🎉🎉")

except KeyboardInterrupt:
logger.info("Job stopped by user.")
exit()
124 changes: 60 additions & 64 deletions audiobook_generator/tts_providers/edge_tts_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@

logger = logging.getLogger(__name__)

MAX_RETRIES = 12 # Max_retries constant for network errors


async def get_supported_voices():
# List all available voices and their attributes.
Expand All @@ -30,67 +28,80 @@ async def get_supported_voices():

for voice in voices:
result[voice["ShortName"]] = voice["Locale"]

return result


# Credit: https://gist.github.com/moha-abdi/8ddbcb206c38f592c65ada1e5479f2bf
# @phuchoang2603 contributed pause support in https://github.com/p0n1/epub_to_audiobook/pull/45
class CommWithPauses(Communicate):
# This class uses edge_tts to generate speech from text,
# but with pauses inserted. For example: text: 'Hello
# this is simple text. [pause: 2s] Paused 2s'
# this is simple text. [pause: 1000] Paused 1000ms'
def __init__(
self,
text: str,
voice_name: str,
**kwargs
break_string: str,
break_duration: int = 1250,
**kwargs,
) -> None:
super().__init__(text, voice_name, **kwargs)
self.break_string = break_string
self.break_duration = int(break_duration)
self.parsed = self.parse_text()
self.file = io.BytesIO()

def parse_text(self):
if not "[pause:" in self.text:
return [(0, self.text)]

parts = self.text.split("[pause:")
for part in parts:
if "]" in part:
pause_time, content = part.split("]", 1)
yield int(pause_time), content.strip()

else:
content = part
yield 0, content.strip()
logger.debug(
f"Parsing the text, looking for break/pauses in text: <{self.text}>"
)
if self.break_string not in self.text:
logger.debug(f"No break/pauses found in the text")
return [self.text]

parts = self.text.split(self.break_string)
logger.debug(f"split into <{len(parts)}> parts: {parts}")
return parts

async def chunkify(self):
for pause_time, content in self.parsed:
if pause_time:
pause_bytes = self.generate_pause(pause_time)
logger.debug(f"Chunkifying the text")
for content in self.parsed:
audio_bytes = await self.generate_audio(content)
self.file.write(audio_bytes)
if content != self.parsed[-1] and self.break_duration > 0:
# only one break duration, shared by all breaks, is supported for now
pause_bytes = self.generate_pause(self.break_duration)
self.file.write(pause_bytes)

if content:
audio_bytes = await self.generate_audio(content)
self.file.write(audio_bytes)
logger.debug(f"Chunkifying done")

def generate_pause(self, time: int) -> bytes:
logger.debug(f"Generating pause")
# pause time should be provided in ms
silent: AudioSegment = AudioSegment.silent(time, 24000)
return silent.raw_data
return silent.raw_data # type: ignore

async def generate_audio(self, text: str) -> bytes:
logger.debug(f"Generating audio for: <{text}>")
# this generates the real TTS audio using edge_tts for this part.
temp_chunk = io.BytesIO()
self.text = text
async for chunk in self.stream():
if chunk['type'] == 'audio':
temp_chunk.write(chunk['data'])
if chunk["type"] == "audio":
temp_chunk.write(chunk["data"])

temp_chunk.seek(0)
# handle the case where the chunk is empty
try:
logger.debug(f"Decoding the chunk")
decoded_chunk = AudioSegment.from_mp3(temp_chunk)
except:
except Exception as e:
logger.warning(
f"Failed to decode the chunk, reason: {e}, returning a silent chunk."
)
decoded_chunk = AudioSegment.silent(0, 24000)
return decoded_chunk.raw_data
logger.debug(f"Returning the decoded chunk")
return decoded_chunk.raw_data # type: ignore

async def save(
self,
Expand All @@ -99,16 +110,15 @@ async def save(
) -> None:
# Save the audio and metadata to the specified files.
await self.chunkify()
await super().save(audio_fname, metadata_fname)

self.file.seek(0)
audio: AudioSegment = AudioSegment.from_raw(
self.file,
sample_width=2,
frame_rate=24000,
channels=1
self.file, sample_width=2, frame_rate=24000, channels=1
)
logger.debug(f"Exporting the audio")
audio.export(audio_fname)
logger.info(f"Saved the audio to: {audio_fname}")


class EdgeTTSProvider(BaseTTSProvider):
def __init__(self, config: GeneralConfig):
Expand All @@ -131,32 +141,29 @@ def __str__(self) -> str:

async def validate_config(self):
if self.config.voice_name not in await get_supported_voices():
raise ValueError(f"EdgeTTS: Unsupported voice name: {self.config.voice_name}")
raise ValueError(
f"EdgeTTS: Unsupported voice name: {self.config.voice_name}"
)

def text_to_speech(
self,
text: str,
output_file: str,
audio_tags: AudioTags,
self,
text: str,
output_file: str,
audio_tags: AudioTags,
):
# Replace break string with pause tag
text = text.replace(
self.get_break_string().strip(),
f"[pause: {self.config.break_duration}]"
)

communicate = CommWithPauses(
text=text,
voice_name=self.config.voice_name,
break_string=self.get_break_string().strip(),
break_duration=int(self.config.break_duration),
rate=self.config.voice_rate,
volume=self.config.voice_volume,
pitch=self.config.voice_pitch,
proxy=self.config.proxy
proxy=self.config.proxy,
)

asyncio.run(
communicate.save(output_file)
)
asyncio.run(communicate.save(output_file))

set_audio_tags(output_file, audio_tags)

Expand All @@ -167,21 +174,10 @@ def get_break_string(self):
return " @BRK#"

def get_output_file_extension(self):
if self.config.output_format.startswith("amr"):
return "amr"
elif self.config.output_format.startswith("ogg"):
return "ogg"
elif self.config.output_format.endswith("truesilk"):
return "silk"
elif self.config.output_format.endswith("pcm"):
return "pcm"
elif self.config.output_format.startswith("raw"):
return "wav"
elif self.config.output_format.startswith("webm"):
return "webm"
elif self.config.output_format.endswith("opus"):
return "opus"
elif self.config.output_format.endswith("mp3"):
if self.config.output_format.endswith("mp3"):
return "mp3"
else:
raise NotImplementedError(f"Unknown file extension for output format: {self.config.output_format}")
# Only mp3 supported in edge-tts https://github.com/rany2/edge-tts/issues/179
raise NotImplementedError(
f"Unknown file extension for output format: {self.config.output_format}. Only mp3 supported in edge-tts. See https://github.com/rany2/edge-tts/issues/179."
)
28 changes: 15 additions & 13 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

from audiobook_generator.config.general_config import GeneralConfig
from audiobook_generator.core.audiobook_generator import AudiobookGenerator
from audiobook_generator.tts_providers.base_tts_provider import get_supported_tts_providers
from audiobook_generator.tts_providers.base_tts_provider import (
get_supported_tts_providers,
)

logging.basicConfig(
level=logging.INFO,
Expand All @@ -25,7 +27,7 @@ def handle_args():
)
parser.add_argument(
"--log",
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
default="INFO",
help="Log level (default: INFO), can be DEBUG, INFO, WARNING, ERROR, CRITICAL",
)
Expand All @@ -46,9 +48,9 @@ def handle_args():
)
parser.add_argument(
"--newline_mode",
choices=["single", "double"],
choices=["single", "double", "none"],
default="double",
help="Choose the mode of detecting new paragraphs: 'single' or 'double'. 'single' means a single newline character, while 'double' means two consecutive newline characters. (default: double, works for most ebooks but will detect less paragraphs for some ebooks)",
help="Choose the mode of detecting new paragraphs: 'single', 'double', or 'none'. 'single' means a single newline character, while 'double' means two consecutive newline characters. 'none' means all newline characters will be replace with blank so paragraphs will not be detected. (default: double, works for most ebooks but will detect less paragraphs for some ebooks)",
)
parser.add_argument(
"--title_mode",
Expand Down Expand Up @@ -97,38 +99,38 @@ def handle_args():
edge_tts_group = parser.add_argument_group(title="edge specific")
edge_tts_group.add_argument(
"--voice_rate",
help='''
help="""
Speaking rate of the text. Valid relative values range from -50%%(--xxx='-50%%') to +100%%.
For negative value use format --arg=value,
'''
""",
)

edge_tts_group.add_argument(
"--voice_volume",
help='''
help="""
Volume level of the speaking voice. Valid relative values floor to -100%%.
For negative value use format --arg=value,
'''
""",
)

edge_tts_group.add_argument(
"--voice_pitch",
help='''
help="""
Baseline pitch for the text.Valid relative values like -80Hz,+50Hz, pitch changes should be within 0.5 to 1.5 times the original audio.
For negative value use format --arg=value,
'''
""",
)

edge_tts_group.add_argument(
"--proxy",
help="Proxy server for the TTS provider. Format: http://[username:password@]proxy.server:port",
)

azure_tts_group = parser.add_argument_group(title="azure specific")
azure_tts_group.add_argument(
azure_edge_tts_group = parser.add_argument_group(title="azure/edge specific")
azure_edge_tts_group.add_argument(
"--break_duration",
default="1250",
help="Break duration in milliseconds for the different paragraphs or sections (default: 1250). Valid values range from 0 to 5000 milliseconds.",
help="Break duration in milliseconds for the different paragraphs or sections (default: 1250, means 1.25 s). Valid values range from 0 to 5000 milliseconds for Azure TTS.",
)

args = parser.parse_args()
Expand Down

0 comments on commit 99748a9

Please sign in to comment.