支持edge_tts

shell-nlp · Dec 23, 2024 · 8ba4a84 · 8ba4a84
1 parent d682f61
commit 8ba4a84
Show file tree

Hide file tree

Showing 8 changed files with 183 additions and 103 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,9 +1,11 @@
 # FROM docker.rainbond.cc/506610466/cuda:12.2.0-runtime-ubuntu20.04-uv
-FROM 506610466/cuda:12.2.0-runtime-ubuntu20.04-uv
+# FROM 506610466/cuda:12.2.0-runtime-ubuntu20.04-uv
+# 从基础镜像开始构建，加快构建速度
+FROM 506610466/gpt_server:base 
 COPY ./ /gpt_server
 WORKDIR /gpt_server
-
-RUN uv venv --seed && uv sync && uv cache clean && \
-    echo '[[ -f .venv/bin/activate ]] && source .venv/bin/activate' >> ~/.bashrc
+RUN uv sync && uv cache clean
+# RUN uv venv --seed && uv sync && uv cache clean && \
+#     echo '[[ -f .venv/bin/activate ]] && source .venv/bin/activate' >> ~/.bashrc
 
 CMD ["/bin/bash"]
diff --git a/Dockerfile.copy b/Dockerfile.copy
@@ -1,4 +1,4 @@
-FROM docker.rainbond.cc/506610466/gpt_server:latest 
+FROM hub.geekery.cn/506610466/gpt_server:latest 
 
 COPY ./ /gpt_server
 

diff --git a/gpt_server/openai_api_protocol/custom_api_protocol.py b/gpt_server/openai_api_protocol/custom_api_protocol.py
@@ -14,6 +14,26 @@
 from pydantic import Field, BaseModel
 
 
+class SpeechRequest(BaseModel):
+    model: str = Field(
+        default="edge_tts", description="One of the available TTS models:"
+    )
+    input: str = Field(
+        description="The text to generate audio for. The maximum length is 4096 characters."
+    )
+    voice: str = Field(
+        default="zh-CN-YunxiNeural",
+        description="The voice to use when generating the audio",
+    )
+    response_format: Optional[str] = Field(
+        default="mp3", description="The format of the audio"
+    )
+    speed: Optional[float] = Field(
+        default=1.0,
+        description="The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.",
+    )
+
+
 class ModerationsRequest(BaseModel):
     input: Union[str, List[str]]
     model: str

diff --git a/gpt_server/serving/openai_api_server.py b/gpt_server/serving/openai_api_server.py
@@ -20,7 +20,7 @@
 from fastapi import Depends, HTTPException
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import StreamingResponse, JSONResponse
+from fastapi.responses import StreamingResponse, JSONResponse, FileResponse
 from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer
 import httpx
 
@@ -699,7 +699,31 @@ async def generate_completion(payload: Dict[str, Any], worker_addr: str):
     CustomEmbeddingsRequest,
     RerankRequest,
     ModerationsRequest,
+    SpeechRequest,
 )
+import edge_tts
+import uuid
+
+OUTPUT_DIR = "./edge_tts_cache"
+
+
+@app.post("/v1/audio/speech", dependencies=[Depends(check_api_key)])
+async def speech(request: SpeechRequest):
+    os.makedirs(OUTPUT_DIR, exist_ok=True)  # 即使存在也不会报错
+    list_voices = await edge_tts.list_voices()
+    support_list_voices = [i["ShortName"] for i in list_voices]
+    if request.voice not in support_list_voices:
+        return JSONResponse(
+            ErrorResponse(
+                message=f"不支持voice:{request.voice}", code=ErrorCode.INVALID_MODEL
+            ).dict(),
+            status_code=400,
+        )
+    filename = f"{uuid.uuid4()}.mp3"
+    output_path = os.path.join(OUTPUT_DIR, filename)
+    communicate = edge_tts.Communicate(text=request.input, voice=request.voice)
+    await communicate.save(output_path)
+    return FileResponse(output_path, media_type="audio/mpeg", filename=filename)
 
 
 @app.post("/v1/moderations", dependencies=[Depends(check_api_key)])

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "qwen_vl_utils",
     "evalscope[perf]==0.7.0",
     "modelscope==1.20.1",
+    "edge-tts>=7.0.0",
 ]
 
 [tool.uv]
@@ -37,6 +38,10 @@ override-dependencies = [
 
 ]
 
+[[tool.uv.index]]
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+default = true
+
 [project.scripts]
 gpt_server = "gpt_server.cli:main"
 

diff --git a/requirements.txt b/requirements.txt
@@ -21,6 +21,7 @@ aiohappyeyeballs==2.4.4
 aiohttp==3.11.11
     # via
     #   datasets
+    #   edge-tts
     #   evalscope
     #   fschat
     #   fsspec
@@ -75,6 +76,7 @@ cachetools==5.5.0
     #   streamlit
 certifi==2024.12.14
     # via
+    #   edge-tts
     #   httpcore
     #   httpx
     #   requests
@@ -84,7 +86,7 @@ cffi==1.17.1
     #   soundfile
 charset-normalizer==3.4.0
     # via requests
-click==8.1.7
+click==8.1.8
     # via
     #   nltk
     #   ray
@@ -133,6 +135,8 @@ diskcache==5.6.3
     #   outlines
 distro==1.9.0
     # via openai
+edge-tts==7.0.0
+    # via gpt-server (pyproject.toml)
 editdistance==0.8.1
     # via evalscope
 einops==0.8.0
@@ -256,7 +260,7 @@ interegular==0.3.3
     #   outlines-core
 jieba==0.42.1
     # via evalscope
-jinja2==3.1.4
+jinja2==3.1.5
     # via
     #   altair
     #   gradio
@@ -741,6 +745,8 @@ sortedcontainers==2.4.0
     # via modelscope
 soundfile==0.12.1
     # via infinity-emb
+srt==3.5.3
+    # via edge-tts
 sse-starlette==2.1.3
     # via evalscope
 starlette==0.38.6
@@ -759,6 +765,7 @@ sympy==1.13.1
     #   torch
 tabulate==0.9.0
     # via
+    #   edge-tts
     #   evalscope
     #   sacrebleu
 tenacity==9.0.0
@@ -865,6 +872,7 @@ typing-extensions==4.12.2
     # via
     #   altair
     #   anyio
+    #   edge-tts
     #   fastapi
     #   gradio
     #   gradio-client
@@ -885,7 +893,7 @@ tzdata==2024.2
     # via pandas
 unicorn==2.1.1
     # via evalscope
-urllib3==2.2.3
+urllib3==2.3.0
     # via
     #   modelscope
     #   requests

diff --git a/tests/test_tts.py b/tests/test_tts.py
@@ -0,0 +1,12 @@
+from pathlib import Path
+from openai import OpenAI
+
+# Uses the new-version openai client
+client = OpenAI(api_key="EMPTY", base_url="http://localhost:8082/v1")
+speech_file_path = Path(__file__).parent / "speech.mp3"
+response = client.audio.speech.create(
+    model="edge_tts",
+    voice="zh-CN-YunxiNeural",
+    input="你好啊，我是人工智能。",
+)
+response.write_to_file(speech_file_path)