Merge pull request #71 from p0n1/fix-edge-pause

Fix edge pause
p0n1 · Jun 28, 2024 · 99748a9 · 99748a9
2 parents d9f70c2 + d16d571
commit 99748a9
Show file tree

Hide file tree

Showing 6 changed files with 101 additions and 87 deletions.
diff --git a/README.md b/README.md
@@ -83,7 +83,8 @@ python3 main.py -h
 usage: main.py [-h] [--tts {azure,openai,edge}]
                [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--preview]
                [--no_prompt] [--language LANGUAGE]
-               [--newline_mode {single,double}]
+               [--newline_mode {single,double,none}]
+               [--title_mode {auto,tag_text,first_few}]
                [--chapter_start CHAPTER_START] [--chapter_end CHAPTER_END]
                [--output_text] [--remove_endnotes] [--voice_name VOICE_NAME]
                [--output_format OUTPUT_FORMAT] [--model_name MODEL_NAME]
@@ -126,13 +127,20 @@ options:
                         different strategies in this tool, especially for
                         Chinese characters. For Chinese books, use zh-CN, zh-
                         TW, or zh-HK.
-  --newline_mode {single,double}
-                        Choose the mode of detecting new paragraphs: 'single'
-                        or 'double'. 'single' means a single newline
+  --newline_mode {single,double,none}
+                        Choose the mode of detecting new paragraphs: 'single',
+                        'double', or 'none'. 'single' means a single newline
                         character, while 'double' means two consecutive
-                        newline characters. (default: double, works for most
+                        newline characters. 'none' means all newline
+                        characters will be replaced with a space so paragraphs
+                        will not be detected. (default: double, works for most
                         ebooks but will detect less paragraphs for some
                         ebooks)
+  --title_mode {auto,tag_text,first_few}
+                        Choose the parse mode for chapter title, 'tag_text'
+                        search 'title','h1','h2','h3' tag for title,
+                        'first_few' set first 60 characters as title, 'auto'
+                        auto apply the best mode for current chapter.
   --chapter_start CHAPTER_START
                         Chapter start index (default: 1, starting from 1)
   --chapter_end CHAPTER_END
@@ -170,11 +178,12 @@ edge specific:
   --proxy PROXY         Proxy server for the TTS provider. Format:
                         http://[username:password@]proxy.server:port
 
-azure specific:
+azure/edge specific:
   --break_duration BREAK_DURATION
                         Break duration in milliseconds for the different
-                        paragraphs or sections (default: 1250). Valid values
-                        range from 0 to 5000 milliseconds.
+                        paragraphs or sections (default: 1250, means 1.25 s).
+                        Valid values range from 0 to 5000 milliseconds for
+                        Azure TTS.
 ```  
 
 **Example**:
@@ -249,7 +258,7 @@ Check https://platform.openai.com/docs/quickstart/account-setup. Make sure you c
 
 Edge TTS and Azure TTS are almost the same; the difference is that Edge TTS doesn't require an API key because it's based on Edge's read-aloud functionality, and its parameters are a bit more restricted, such as [custom ssml](https://github.com/rany2/edge-tts#custom-ssml).
 
-Check https://github.com/p0n1/epub_to_audiobook/blob/main/audiobook_generator/tts_providers/edge_tts_provider.py#L17 for supported voices.
+Check https://gist.github.com/BettyJJ/17cbaa1de96235a7f5773b8690a20462 for supported voices.
 
 **If you want to try this project quickly, Edge TTS is highly recommended.**
 

diff --git a/audiobook_generator/book_parsers/epub_book_parser.py b/audiobook_generator/book_parsers/epub_book_parser.py
@@ -53,6 +53,8 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
                 cleaned_text = re.sub(r"[\n]+", break_string, raw.strip())
             elif self.config.newline_mode == "double":
                 cleaned_text = re.sub(r"[\n]{2,}", break_string, raw.strip())
+            elif self.config.newline_mode == "none":
+                cleaned_text = re.sub(r"[\n]+", " ", raw.strip())
             else:
                 raise ValueError(f"Invalid newline mode: {self.config.newline_mode}")
 

diff --git a/audiobook_generator/config/general_config.py b/audiobook_generator/config/general_config.py
@@ -22,7 +22,7 @@ def __init__(self, args):
         self.output_format = args.output_format
         self.model_name = args.model_name
 
-        # TTS provider: Azure specific arguments
+        # TTS provider: Azure & Edge TTS specific arguments
         self.break_duration = args.break_duration
 
         # TTS provider: Edge specific arguments

diff --git a/audiobook_generator/core/audiobook_generator.py b/audiobook_generator/core/audiobook_generator.py
@@ -103,6 +103,11 @@ def run(self):
                     output_file,
                     audio_tags,
                 )
+                logger.info(
+                    f"✅ Converted chapter {idx}/{len(chapters)}: {title}"
+                )
+            logger.info(f"All chapters converted. 🎉🎉🎉")
+
         except KeyboardInterrupt:
             logger.info("Job stopped by user.")
             exit()
diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py
@@ -14,8 +14,6 @@
 
 logger = logging.getLogger(__name__)
 
-MAX_RETRIES = 12  # Max_retries constant for network errors
-
 
 async def get_supported_voices():
     # List all available voices and their attributes.
@@ -30,67 +28,80 @@ async def get_supported_voices():
 
     for voice in voices:
         result[voice["ShortName"]] = voice["Locale"]
-       
+
     return result
 
+
+# Credit: https://gist.github.com/moha-abdi/8ddbcb206c38f592c65ada1e5479f2bf
+# @phuchoang2603 contributed pause support in https://github.com/p0n1/epub_to_audiobook/pull/45
 class CommWithPauses(Communicate):
     # This class uses edge_tts to generate text
     # but with pauses for example:- text: 'Hello
-    # this is simple text. [pause: 2s] Paused 2s'
+    # this is simple text. [pause: 1000] Paused 1000ms'
     def __init__(
         self,
         text: str,
         voice_name: str,
-        **kwargs
+        break_string: str,
+        break_duration: int = 1250,
+        **kwargs,
     ) -> None:
         super().__init__(text, voice_name, **kwargs)
+        self.break_string = break_string
+        self.break_duration = int(break_duration)
         self.parsed = self.parse_text()
         self.file = io.BytesIO()
 
     def parse_text(self):
-        if not "[pause:" in self.text:
-            return [(0, self.text)]
-
-        parts = self.text.split("[pause:")
-        for part in parts:
-            if "]" in part:
-                pause_time, content = part.split("]", 1)
-                yield int(pause_time), content.strip()
-
-            else:
-                content = part
-                yield 0, content.strip()
+        logger.debug(
+            f"Parsing the text, looking for break/pauses in text: <{self.text}>"
+        )
+        if self.break_string not in self.text:
+            logger.debug(f"No break/pauses found in the text")
+            return [self.text]
+
+        parts = self.text.split(self.break_string)
+        logger.debug(f"split into <{len(parts)}> parts: {parts}")
+        return parts
 
     async def chunkify(self):
-        for pause_time, content in self.parsed:
-            if pause_time:
-                pause_bytes = self.generate_pause(pause_time)
+        logger.debug(f"Chunkifying the text")
+        for content in self.parsed:
+            audio_bytes = await self.generate_audio(content)
+            self.file.write(audio_bytes)
+            if content != self.parsed[-1] and self.break_duration > 0:
+                # only same break duration for all breaks is supported now
+                pause_bytes = self.generate_pause(self.break_duration)
                 self.file.write(pause_bytes)
-
-            if content:
-                audio_bytes = await self.generate_audio(content)
-                self.file.write(audio_bytes)
+        logger.debug(f"Chunkifying done")
 
     def generate_pause(self, time: int) -> bytes:
+        logger.debug(f"Generating pause")
         # pause time should be provided in ms
         silent: AudioSegment = AudioSegment.silent(time, 24000)
-        return silent.raw_data
+        return silent.raw_data  # type: ignore
 
     async def generate_audio(self, text: str) -> bytes:
+        logger.debug(f"Generating audio for: <{text}>")
         # this generates the real TTS using edge_tts for this part.
         temp_chunk = io.BytesIO()
         self.text = text
         async for chunk in self.stream():
-            if chunk['type'] == 'audio':
-                temp_chunk.write(chunk['data'])
+            if chunk["type"] == "audio":
+                temp_chunk.write(chunk["data"])
 
         temp_chunk.seek(0)
         # handle the case where the chunk is empty
         try:
+            logger.debug(f"Decoding the chunk")
             decoded_chunk = AudioSegment.from_mp3(temp_chunk)
-        except:
+        except Exception as e:
+            logger.warning(
+                f"Failed to decode the chunk, reason: {e}, returning a silent chunk."
+            )
             decoded_chunk = AudioSegment.silent(0, 24000)
-        return decoded_chunk.raw_data
+        logger.debug(f"Returning the decoded chunk")
+        return decoded_chunk.raw_data  # type: ignore
 
     async def save(
         self,
@@ -99,16 +110,15 @@ async def save(
     ) -> None:
         # Save the audio and metadata to the specified files.
         await self.chunkify()
-        await super().save(audio_fname, metadata_fname)
 
         self.file.seek(0)
         audio: AudioSegment = AudioSegment.from_raw(
-            self.file,
-            sample_width=2,
-            frame_rate=24000,
-            channels=1
+            self.file, sample_width=2, frame_rate=24000, channels=1
         )
+        logger.debug(f"Exporting the audio")
         audio.export(audio_fname)
+        logger.info(f"Saved the audio to: {audio_fname}")
+
 
 class EdgeTTSProvider(BaseTTSProvider):
     def __init__(self, config: GeneralConfig):
@@ -131,32 +141,29 @@ def __str__(self) -> str:
 
     async def validate_config(self):
         if self.config.voice_name not in await get_supported_voices():
-            raise ValueError(f"EdgeTTS: Unsupported voice name: {self.config.voice_name}")
+            raise ValueError(
+                f"EdgeTTS: Unsupported voice name: {self.config.voice_name}"
+            )
 
     def text_to_speech(
-            self,
-            text: str,
-            output_file: str,
-            audio_tags: AudioTags,
+        self,
+        text: str,
+        output_file: str,
+        audio_tags: AudioTags,
     ):
-        # Replace break string with pause tag
-        text = text.replace(
-            self.get_break_string().strip(),
-            f"[pause: {self.config.break_duration}]"
-        )
 
         communicate = CommWithPauses(
             text=text,
             voice_name=self.config.voice_name,
+            break_string=self.get_break_string().strip(),
+            break_duration=int(self.config.break_duration),
             rate=self.config.voice_rate,
             volume=self.config.voice_volume,
             pitch=self.config.voice_pitch,
-            proxy=self.config.proxy
+            proxy=self.config.proxy,
         )
 
-        asyncio.run(
-            communicate.save(output_file)
-        )
+        asyncio.run(communicate.save(output_file))
 
         set_audio_tags(output_file, audio_tags)
 
@@ -167,21 +174,10 @@ def get_break_string(self):
         return " @BRK#"
 
     def get_output_file_extension(self):
-        if self.config.output_format.startswith("amr"):
-            return "amr"
-        elif self.config.output_format.startswith("ogg"):
-            return "ogg"
-        elif self.config.output_format.endswith("truesilk"):
-            return "silk"
-        elif self.config.output_format.endswith("pcm"):
-            return "pcm"
-        elif self.config.output_format.startswith("raw"):
-            return "wav"
-        elif self.config.output_format.startswith("webm"):
-            return "webm"
-        elif self.config.output_format.endswith("opus"):
-            return "opus"
-        elif self.config.output_format.endswith("mp3"):
+        if self.config.output_format.endswith("mp3"):
             return "mp3"
         else:
-            raise NotImplementedError(f"Unknown file extension for output format: {self.config.output_format}")
+            # Only mp3 supported in edge-tts https://github.com/rany2/edge-tts/issues/179
+            raise NotImplementedError(
+                f"Unknown file extension for output format: {self.config.output_format}. Only mp3 supported in edge-tts. See https://github.com/rany2/edge-tts/issues/179."
+            )
diff --git a/main.py b/main.py
@@ -3,7 +3,9 @@
 
 from audiobook_generator.config.general_config import GeneralConfig
 from audiobook_generator.core.audiobook_generator import AudiobookGenerator
-from audiobook_generator.tts_providers.base_tts_provider import get_supported_tts_providers
+from audiobook_generator.tts_providers.base_tts_provider import (
+    get_supported_tts_providers,
+)
 
 logging.basicConfig(
     level=logging.INFO,
@@ -25,7 +27,7 @@ def handle_args():
     )
     parser.add_argument(
         "--log",
-        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
         default="INFO",
         help="Log level (default: INFO), can be DEBUG, INFO, WARNING, ERROR, CRITICAL",
     )
@@ -46,9 +48,9 @@ def handle_args():
     )
     parser.add_argument(
         "--newline_mode",
-        choices=["single", "double"],
+        choices=["single", "double", "none"],
         default="double",
-        help="Choose the mode of detecting new paragraphs: 'single' or 'double'. 'single' means a single newline character, while 'double' means two consecutive newline characters. (default: double, works for most ebooks but will detect less paragraphs for some ebooks)",
+        help="Choose the mode of detecting new paragraphs: 'single', 'double', or 'none'. 'single' means a single newline character, while 'double' means two consecutive newline characters. 'none' means all newline characters will be replace with blank so paragraphs will not be detected. (default: double, works for most ebooks but will detect less paragraphs for some ebooks)",
     )
     parser.add_argument(
         "--title_mode",
@@ -97,38 +99,38 @@ def handle_args():
     edge_tts_group = parser.add_argument_group(title="edge specific")
     edge_tts_group.add_argument(
         "--voice_rate",
-        help='''
+        help="""
             Speaking rate of the text. Valid relative values range from -50%%(--xxx='-50%%') to +100%%. 
             For negative value use format --arg=value,
-        '''
+        """,
     )
 
     edge_tts_group.add_argument(
         "--voice_volume",
-        help='''
+        help="""
             Volume level of the speaking voice. Valid relative values floor to -100%%.
             For negative value use format --arg=value,
-        '''
+        """,
     )
 
     edge_tts_group.add_argument(
         "--voice_pitch",
-        help='''
+        help="""
             Baseline pitch for the text.Valid relative values like -80Hz,+50Hz, pitch changes should be within 0.5 to 1.5 times the original audio.
             For negative value use format --arg=value,
-        '''
+        """,
     )
 
     edge_tts_group.add_argument(
         "--proxy",
         help="Proxy server for the TTS provider. Format: http://[username:password@]proxy.server:port",
     )
 
-    azure_tts_group = parser.add_argument_group(title="azure specific")
-    azure_tts_group.add_argument(
+    azure_edge_tts_group = parser.add_argument_group(title="azure/edge specific")
+    azure_edge_tts_group.add_argument(
         "--break_duration",
         default="1250",
-        help="Break duration in milliseconds for the different paragraphs or sections (default: 1250). Valid values range from 0 to 5000 milliseconds.",
+        help="Break duration in milliseconds for the different paragraphs or sections (default: 1250, means 1.25 s). Valid values range from 0 to 5000 milliseconds for Azure TTS.",
     )
 
     args = parser.parse_args()