Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix edge pause #71

Merged
merged 5 commits into from
Jun 28, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
fix: #56 bad audio for edge tts pause feature
  • Loading branch information
p0n1 committed Jun 26, 2024
commit 9ecd3d03851ea300a04d3bbb7e28a8215333ff66
24 changes: 19 additions & 5 deletions audiobook_generator/tts_providers/edge_tts_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@

logger = logging.getLogger(__name__)

MAX_RETRIES = 12 # Max_retries constant for network errors


async def get_supported_voices():
# List all available voices and their attributes.
# This pulls data from the URL used by Microsoft Edge to return a list of
Expand All @@ -33,6 +30,9 @@ async def get_supported_voices():

return result


# Credit: https://gist.github.com/moha-abdi/8ddbcb206c38f592c65ada1e5479f2bf
# @phuchoang2603 contributed pause support in https://github.com/p0n1/epub_to_audiobook/pull/45
class CommWithPauses(Communicate):
# This class uses edge_tts to generate text
# but with pauses for example:- text: 'Hello
Expand All @@ -48,22 +48,30 @@ def __init__(
self.file = io.BytesIO()

def parse_text(self):
logger.debug(f"Parsing the text, looking for pauses in text: {self.text}")
if not "[pause:" in self.text:
return [(0, self.text)]
logger.debug(f"No pauses found in the text")
yield 0, self.text

parts = self.text.split("[pause:")
logger.debug(f"split into parts: {parts}")
for part in parts:
if "]" in part:
pause_time, content = part.split("]", 1)
logger.debug(f"Pause time: {pause_time}, Content: {content.strip()}")
yield int(pause_time), content.strip()

else:
content = part
logger.debug(f"No pause time, Content: {content.strip()}")
yield 0, content.strip()

async def chunkify(self):
logger.debug(f"Chunkifying the text")
for pause_time, content in self.parsed:
if pause_time:
logger.debug(f"pause_time: {pause_time}")
logger.debug(f"content: {content}")
if pause_time > 0:
pause_bytes = self.generate_pause(pause_time)
self.file.write(pause_bytes)

Expand All @@ -77,6 +85,7 @@ def generate_pause(self, time: int) -> bytes:
return silent.raw_data

async def generate_audio(self, text: str) -> bytes:
logger.debug(f"Generating audio for: {text}")
# this generates the real TTS using edge_tts for this part.
temp_chunk = io.BytesIO()
self.text = text
Expand All @@ -87,8 +96,10 @@ async def generate_audio(self, text: str) -> bytes:
temp_chunk.seek(0)
# handle the case where the chunk is empty
try:
logger.debug(f"Decoding the chunk")
decoded_chunk = AudioSegment.from_mp3(temp_chunk)
except:
logger.debug(f"Empty chunk")
decoded_chunk = AudioSegment.silent(0, 24000)
return decoded_chunk.raw_data

Expand Down Expand Up @@ -139,12 +150,15 @@ def text_to_speech(
output_file: str,
audio_tags: AudioTags,
):

# Replace break string with pause tag
text = text.replace(
self.get_break_string().strip(),
f"[pause: {self.config.break_duration}]"
)

logger.debug(f"Text to speech, adding pause mark: {text}")

communicate = CommWithPauses(
text=text,
voice_name=self.config.voice_name,
Expand Down