Skip to content

Commit

Permalink
feat:add tts-streaming config and future (langgenius#5492)
Browse files Browse the repository at this point in the history
  • Loading branch information
ic-xu authored Jul 9, 2024
1 parent b29a36f commit 6ef401a
Show file tree
Hide file tree
Showing 44 changed files with 1,281 additions and 359 deletions.
4 changes: 4 additions & 0 deletions api/constants/tts_auto_play_timeout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
TTS_AUTO_PLAY_TIMEOUT = 5

# sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
TTS_AUTO_PLAY_YIELD_CPU_TIME = 0.02
31 changes: 26 additions & 5 deletions api/controllers/console/app/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,36 @@ class ChatMessageTextApi(Resource):
@account_initialization_required
@get_app_model
def post(self, app_model):
from werkzeug.exceptions import InternalServerError

try:
parser = reqparse.RequestParser()
parser.add_argument('message_id', type=str, location='json')
parser.add_argument('text', type=str, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, location='json')
args = parser.parse_args()

message_id = args.get('message_id', None)
text = args.get('text', None)
if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
and app_model.workflow
and app_model.workflow.features_dict):
text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
else:
try:
voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get(
'voice')
except Exception:
voice = None
response = AudioService.transcript_tts(
app_model=app_model,
text=request.form['text'],
voice=request.form['voice'],
streaming=False
text=text,
message_id=message_id,
voice=voice
)

return {'data': response.data.decode('latin1')}
return response
except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.")
raise AppUnavailableError()
Expand Down
30 changes: 25 additions & 5 deletions api/controllers/console/explore/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from controllers.console.explore.wraps import InstalledAppResource
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.model_runtime.errors.invoke import InvokeError
from models.model import AppMode
from services.audio_service import AudioService
from services.errors.audio import (
AudioTooLargeServiceError,
Expand Down Expand Up @@ -70,16 +71,33 @@ def post(self, installed_app):

class ChatTextApi(InstalledAppResource):
def post(self, installed_app):
app_model = installed_app.app
from flask_restful import reqparse

app_model = installed_app.app
try:
parser = reqparse.RequestParser()
parser.add_argument('message_id', type=str, required=False, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, location='json')
args = parser.parse_args()

message_id = args.get('message_id')
if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
and app_model.workflow
and app_model.workflow.features_dict):
text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
else:
try:
voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get('voice')
except Exception:
voice = None
response = AudioService.transcript_tts(
app_model=app_model,
text=request.form['text'],
voice=request.form['voice'] if request.form.get('voice') else app_model.app_model_config.text_to_speech_dict.get('voice'),
streaming=False
message_id=message_id,
voice=voice
)
return {'data': response.data.decode('latin1')}
return response
except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.")
raise AppUnavailableError()
Expand Down Expand Up @@ -108,3 +126,5 @@ def post(self, installed_app):

api.add_resource(ChatAudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text', endpoint='installed_app_audio')
api.add_resource(ChatTextApi, '/installed-apps/<uuid:installed_app_id>/text-to-audio', endpoint='installed_app_text')
# api.add_resource(ChatTextApiWithMessageId, '/installed-apps/<uuid:installed_app_id>/text-to-audio/message-id',
# endpoint='installed_app_text_with_message_id')
33 changes: 22 additions & 11 deletions api/controllers/service_api/app/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from controllers.service_api.wraps import FetchUserArg, WhereisUserArg, validate_app_token
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.model_runtime.errors.invoke import InvokeError
from models.model import App, EndUser
from models.model import App, AppMode, EndUser
from services.audio_service import AudioService
from services.errors.audio import (
AudioTooLargeServiceError,
Expand Down Expand Up @@ -72,19 +72,30 @@ def post(self, app_model: App, end_user: EndUser):
class TextApi(Resource):
@validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.JSON))
def post(self, app_model: App, end_user: EndUser):
parser = reqparse.RequestParser()
parser.add_argument('text', type=str, required=True, nullable=False, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, required=False, nullable=False, location='json')
args = parser.parse_args()

try:
parser = reqparse.RequestParser()
parser.add_argument('message_id', type=str, required=False, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, location='json')
args = parser.parse_args()

message_id = args.get('message_id')
if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
and app_model.workflow
and app_model.workflow.features_dict):
text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
else:
try:
voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get(
'voice')
except Exception:
voice = None
response = AudioService.transcript_tts(
app_model=app_model,
text=args['text'],
end_user=end_user,
voice=args.get('voice'),
streaming=args['streaming']
message_id=message_id,
end_user=end_user.external_user_id,
voice=voice
)

return response
Expand Down
29 changes: 24 additions & 5 deletions api/controllers/web/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from controllers.web.wraps import WebApiResource
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.model_runtime.errors.invoke import InvokeError
from models.model import App
from models.model import App, AppMode
from services.audio_service import AudioService
from services.errors.audio import (
AudioTooLargeServiceError,
Expand Down Expand Up @@ -69,16 +69,35 @@ def post(self, app_model: App, end_user):

class TextApi(WebApiResource):
def post(self, app_model: App, end_user):
from flask_restful import reqparse
try:
parser = reqparse.RequestParser()
parser.add_argument('message_id', type=str, required=False, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, location='json')
args = parser.parse_args()

message_id = args.get('message_id')
if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
and app_model.workflow
and app_model.workflow.features_dict):
text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
else:
try:
voice = args.get('voice') if args.get(
'voice') else app_model.app_model_config.text_to_speech_dict.get('voice')
except Exception:
voice = None

response = AudioService.transcript_tts(
app_model=app_model,
text=request.form['text'],
message_id=message_id,
end_user=end_user.external_user_id,
voice=request.form['voice'] if request.form.get('voice') else None,
streaming=False
voice=voice
)

return {'data': response.data.decode('latin1')}
return response
except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.")
raise AppUnavailableError()
Expand Down
135 changes: 135 additions & 0 deletions api/core/app/apps/advanced_chat/app_generator_tts_publisher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import base64
import concurrent.futures
import logging
import queue
import re
import threading

from core.app.entities.queue_entities import QueueAgentMessageEvent, QueueLLMChunkEvent, QueueTextChunkEvent
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType


class AudioTrunk:
def __init__(self, status: str, audio):
self.audio = audio
self.status = status


def _invoiceTTS(text_content: str, model_instance, tenant_id: str, voice: str):
if not text_content or text_content.isspace():
return
return model_instance.invoke_tts(
content_text=text_content.strip(),
user="responding_tts",
tenant_id=tenant_id,
voice=voice
)


def _process_future(future_queue, audio_queue):
while True:
try:
future = future_queue.get()
if future is None:
break
for audio in future.result():
audio_base64 = base64.b64encode(bytes(audio))
audio_queue.put(AudioTrunk("responding", audio=audio_base64))
except Exception as e:
logging.getLogger(__name__).warning(e)
break
audio_queue.put(AudioTrunk("finish", b''))


class AppGeneratorTTSPublisher:

def __init__(self, tenant_id: str, voice: str):
self.logger = logging.getLogger(__name__)
self.tenant_id = tenant_id
self.msg_text = ''
self._audio_queue = queue.Queue()
self._msg_queue = queue.Queue()
self.match = re.compile(r'[。.!?]')
self.model_manager = ModelManager()
self.model_instance = self.model_manager.get_default_model_instance(
tenant_id=self.tenant_id,
model_type=ModelType.TTS
)
self.voices = self.model_instance.get_tts_voices()
values = [voice.get('value') for voice in self.voices]
self.voice = voice
if not voice or voice not in values:
self.voice = self.voices[0].get('value')
self.MAX_SENTENCE = 2
self._last_audio_event = None
self._runtime_thread = threading.Thread(target=self._runtime).start()
self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)

def publish(self, message):
try:
self._msg_queue.put(message)
except Exception as e:
self.logger.warning(e)

def _runtime(self):
future_queue = queue.Queue()
threading.Thread(target=_process_future, args=(future_queue, self._audio_queue)).start()
while True:
try:
message = self._msg_queue.get()
if message is None:
if self.msg_text and len(self.msg_text.strip()) > 0:
futures_result = self.executor.submit(_invoiceTTS, self.msg_text,
self.model_instance, self.tenant_id, self.voice)
future_queue.put(futures_result)
break
elif isinstance(message.event, QueueAgentMessageEvent | QueueLLMChunkEvent):
self.msg_text += message.event.chunk.delta.message.content
elif isinstance(message.event, QueueTextChunkEvent):
self.msg_text += message.event.text
self.last_message = message
sentence_arr, text_tmp = self._extract_sentence(self.msg_text)
if len(sentence_arr) >= min(self.MAX_SENTENCE, 7):
self.MAX_SENTENCE += 1
text_content = ''.join(sentence_arr)
futures_result = self.executor.submit(_invoiceTTS, text_content,
self.model_instance,
self.tenant_id,
self.voice)
future_queue.put(futures_result)
if text_tmp:
self.msg_text = text_tmp
else:
self.msg_text = ''

except Exception as e:
self.logger.warning(e)
break
future_queue.put(None)

def checkAndGetAudio(self) -> AudioTrunk | None:
try:
if self._last_audio_event and self._last_audio_event.status == "finish":
if self.executor:
self.executor.shutdown(wait=False)
return self.last_message
audio = self._audio_queue.get_nowait()
if audio and audio.status == "finish":
self.executor.shutdown(wait=False)
self._runtime_thread = None
if audio:
self._last_audio_event = audio
return audio
except queue.Empty:
return None

def _extract_sentence(self, org_text):
tx = self.match.finditer(org_text)
start = 0
result = []
for i in tx:
end = i.regs[0][1]
result.append(org_text[start:end])
start = end
return result, org_text[start:]
Loading

0 comments on commit 6ef401a

Please sign in to comment.