From b17b05fa2775ba566a0d152f7a9b1512dbafe86e Mon Sep 17 00:00:00 2001
From: Flavio Schneider
Date: Sun, 6 Aug 2023 00:08:01 +0200
Subject: [PATCH] feat: add input streaming

---
 elevenlabs/api/tts.py | 73 +++++++++++++++++++++++++++++++++++++++++++
 elevenlabs/simple.py  | 12 ++++---
 setup.py              |  3 +-
 3 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/elevenlabs/api/tts.py b/elevenlabs/api/tts.py
index 49ace30..1e9f251 100644
--- a/elevenlabs/api/tts.py
+++ b/elevenlabs/api/tts.py
@@ -1,7 +1,13 @@
 from __future__ import annotations
 
+import base64
+import json
+import os
 from typing import Iterator, Optional
 
+import websockets
+from websockets.sync.client import connect
+
 from .base import API, api_base_url_v1
 from .model import Model
 from .voice import Voice
@@ -40,3 +46,70 @@ def generate_stream(
         for chunk in response.iter_content(chunk_size=stream_chunk_size):
             if chunk:
                 yield chunk
+
+    @staticmethod
+    def generate_stream_input(
+        text: Iterator[str], voice: Voice, model: Model, api_key: Optional[str] = None
+    ) -> Iterator[bytes]:
+        message = (
+            "Currently input streaming is only supported for eleven_monolingual_v1"
+            " model"
+        )
+        assert (
+            model.model_id == "eleven_monolingual_v1"
+        ), f"{message}, got {model.model_id}"
+
+        BOS = json.dumps(
+            dict(
+                text=" ",
+                try_trigger_generation=True,
+                generation_config=dict(
+                    chunk_length_schedule=[50],
+                    model_id=model.model_id,
+                    voice_settings=voice.settings.dict() if voice.settings else None,
+                ),
+            )
+        )
+        EOS = json.dumps(dict(text=""))
+
+        with connect(
+            f"wss://api.elevenlabs.io/v1/text-to-speech/{voice.voice_id}/stream-input",
+            additional_headers={
+                "xi-api-key": api_key or os.environ.get("ELEVEN_API_KEY"),
+                "model_id": model.model_id,
+            },
+        ) as websocket:
+            # Send beginning of stream
+            websocket.send(BOS)
+
+            # Stream text chunks and receive audio
+            text_block = ""
+            for text_chunk in text:
+
+                text_block += text_chunk
+                if text_block.endswith((".", "!", "?")):
+                    text_block += " "
+                if not text_block.endswith(" "):
+                    continue
+
+                data = dict(text=text_block, try_trigger_generation=True)
+                text_block = ""
+                websocket.send(json.dumps(data))
+                try:
+                    data = json.loads(websocket.recv(1e-4))
+                    if data["audio"]:
+                        yield base64.b64decode(data["audio"])  # type: ignore
+                except TimeoutError:
+                    pass
+
+            # Send end of stream
+            websocket.send(EOS)
+
+            # Receive remaining audio
+            while True:
+                try:
+                    data = json.loads(websocket.recv())
+                    if data["audio"]:
+                        yield base64.b64decode(data["audio"])  # type: ignore
+                except websockets.exceptions.ConnectionClosed:
+                    break
diff --git a/elevenlabs/simple.py b/elevenlabs/simple.py
index d5f527c..42f4440 100644
--- a/elevenlabs/simple.py
+++ b/elevenlabs/simple.py
@@ -91,7 +91,7 @@ def is_voice_id(val: str) -> bool:
 
 
 def generate(
-    text: str,
+    text: Union[str, Iterator[str]],
     api_key: Optional[str] = None,
     voice: Union[str, Voice] = VOICES_CACHE[2],  # Bella
     model: Union[str, Model] = "eleven_monolingual_v1",
@@ -123,8 +123,12 @@ def generate(
     assert isinstance(model, Model)
 
     if stream:
-        return TTS.generate_stream(
-            text, voice, model, stream_chunk_size, api_key=api_key, latency=latency
-        )  # noqa E501
+        if isinstance(text, str):
+            return TTS.generate_stream(
+                text, voice, model, stream_chunk_size, api_key=api_key, latency=latency
+            )  # noqa E501
+        elif isinstance(text, Iterator):
+            return TTS.generate_stream_input(text, voice, model, api_key=api_key)
     else:
+        assert isinstance(text, str)
         return TTS.generate(text, voice, model, api_key=api_key)
diff --git a/setup.py b/setup.py
index 840dd23..cbfb40c 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name="elevenlabs",
     packages=find_packages(exclude=[]),
-    version="0.2.21",
+    version="0.2.22",
     description="The official elevenlabs python package.",
     long_description_content_type="text/markdown",
     author="Elevenlabs",
@@ -13,6 +13,7 @@
         "pydantic>=1.10,<2.0",
         "ipython>=7.0",
         "requests>=2.20",
+        "websockets>=11.0",
     ],
     classifiers=[
         "Development Status :: 4 - Beta",