
Commit 8ae7917

feat: Add audio parameter support to gemini tts models (#11287)
* feat: Add Gemini TTS audio parameter support
  - Add is_model_gemini_audio_model() method to detect TTS models
  - Include 'audio' parameter in supported params for TTS models
  - Map OpenAI audio parameter to Gemini speechConfig format
  - Add _extract_audio_response_from_parts() method to transform audio output to openai format
* updated unit-test to use pcm16
* - created typedict for speechconfig
  - simplified gemini tts model detection
  - moved gemini_tts test to test_litellm
* simplified is_model_gemini_audio_model more
1 parent 13dc757 commit 8ae7917
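For orientation, a minimal usage sketch of what this commit enables. The model name and voice below are illustrative assumptions, not taken from the diff; only the audio={"voice": ..., "format": "pcm16"} parameter shape and the OpenAI-style message.audio response field come from the changes below.

```python
# Hedged usage sketch: the model name and voice are assumptions, not part of this commit.
import litellm

response = litellm.completion(
    model="gemini/gemini-2.5-flash-preview-tts",           # any Gemini model with "tts" in its name
    messages=[{"role": "user", "content": "Say hello."}],
    audio={"voice": "Kore", "format": "pcm16"},            # only "pcm16" is accepted
)

# Per the diff, audio is surfaced OpenAI-style on the message and content is set to None.
audio = response.choices[0].message.audio
print(audio.expires_at, len(audio.data))
```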

4 files changed: +349, -8 lines changed


litellm/llms/gemini/chat/transformation.py

Lines changed: 40 additions & 1 deletion
@@ -6,7 +6,7 @@
     convert_to_anthropic_image_obj,
 )
 from litellm.types.llms.openai import AllMessageValues
-from litellm.types.llms.vertex_ai import ContentType, PartType
+from litellm.types.llms.vertex_ai import ContentType, PartType, SpeechConfig, VoiceConfig, PrebuiltVoiceConfig
 from litellm.utils import supports_reasoning
 
 from ...vertex_ai.gemini.transformation import _gemini_convert_messages_with_history
@@ -67,6 +67,9 @@ def __init__(
     def get_config(cls):
         return super().get_config()
 
+    def is_model_gemini_audio_model(self, model: str) -> bool:
+        return "tts" in model
+
     def get_supported_openai_params(self, model: str) -> List[str]:
         supported_params = [
             "temperature",
@@ -89,6 +92,8 @@ def get_supported_openai_params(self, model: str) -> List[str]:
         if supports_reasoning(model):
             supported_params.append("reasoning_effort")
             supported_params.append("thinking")
+        if self.is_model_gemini_audio_model(model):
+            supported_params.append("audio")
         return supported_params
 
     def map_openai_params(
@@ -98,6 +103,40 @@ def map_openai_params(
         model: str,
         drop_params: bool,
     ) -> Dict:
+        # Handle audio parameter for TTS models
+        if self.is_model_gemini_audio_model(model):
+            for param, value in non_default_params.items():
+                if param == "audio" and isinstance(value, dict):
+                    # Validate audio format - Gemini TTS only supports pcm16
+                    audio_format = value.get("format")
+                    if audio_format is not None and audio_format != "pcm16":
+                        raise ValueError(
+                            f"Unsupported audio format for Gemini TTS models: {audio_format}. "
+                            f"Gemini TTS models only support 'pcm16' format as they return audio data in L16 PCM format. "
+                            f"Please set audio format to 'pcm16'."
+                        )
+
+                    # Map OpenAI audio parameter to Gemini speech config
+                    speech_config: SpeechConfig = {}
+
+                    if "voice" in value:
+                        prebuilt_voice_config: PrebuiltVoiceConfig = {
+                            "voiceName": value["voice"]
+                        }
+                        voice_config: VoiceConfig = {
+                            "prebuiltVoiceConfig": prebuilt_voice_config
+                        }
+                        speech_config["voiceConfig"] = voice_config
+
+                    if speech_config:
+                        optional_params["speechConfig"] = speech_config
+
+                    # Ensure audio modality is set
+                    if "responseModalities" not in optional_params:
+                        optional_params["responseModalities"] = ["AUDIO"]
+                    elif "AUDIO" not in optional_params["responseModalities"]:
+                        optional_params["responseModalities"].append("AUDIO")
+
         if litellm.vertex_ai_safety_settings is not None:
             optional_params["safety_settings"] = litellm.vertex_ai_safety_settings
         return super().map_openai_params(
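For a model whose name contains "tts", the new branch above rewrites the OpenAI-style audio parameter into Gemini's speechConfig and forces the AUDIO response modality. A rough before/after sketch (values are examples, not from the diff):

```python
# Illustrative mapping for a "tts" model (values are examples).
non_default_params = {"audio": {"voice": "Kore", "format": "pcm16"}}

# After the new branch in map_openai_params runs, optional_params holds:
optional_params = {
    "speechConfig": {
        "voiceConfig": {
            "prebuiltVoiceConfig": {"voiceName": "Kore"},
        },
    },
    "responseModalities": ["AUDIO"],  # added (or appended to) if not already present
}

# Any other format, e.g. {"format": "mp3"}, raises ValueError before this mapping happens.
```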

litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py

Lines changed: 75 additions & 7 deletions
@@ -2,6 +2,7 @@
 ## httpx client for vertex ai calls
 ## Initial implementation - covers gemini + image gen calls
 import json
+import time
 import uuid
 from copy import deepcopy
 from functools import partial
@@ -61,6 +62,7 @@
     UsageMetadata,
 )
 from litellm.types.utils import (
+    ChatCompletionAudioResponse,
     ChatCompletionTokenLogprob,
     ChoiceLogprobs,
     CompletionTokensDetailsWrapper,
@@ -69,7 +71,7 @@
     TopLogprob,
     Usage,
 )
-from litellm.utils import CustomStreamWrapper, ModelResponse, supports_reasoning
+from litellm.utils import CustomStreamWrapper, ModelResponse, is_base64_encoded, supports_reasoning
 
 from ....utils import _remove_additional_properties, _remove_strict_from_schema
 from ..common_utils import VertexAIError, _build_vertex_schema
@@ -676,14 +678,30 @@ def get_assistant_content_message(
     ) -> Tuple[Optional[str], Optional[str]]:
         content_str: Optional[str] = None
         reasoning_content_str: Optional[str] = None
+
         for part in parts:
             _content_str = ""
             if "text" in part:
-                _content_str += part["text"]
-            elif "inlineData" in part:  # base64 encoded image
-                _content_str += "data:{};base64,{}".format(
-                    part["inlineData"]["mimeType"], part["inlineData"]["data"]
-                )
+                text_content = part["text"]
+                # Check if text content is audio data URI - if so, exclude from text content
+                if text_content.startswith("data:audio") and ";base64," in text_content:
+                    try:
+                        if is_base64_encoded(text_content):
+                            media_type, _ = text_content.split("data:")[1].split(";base64,")
+                            if media_type.startswith("audio/"):
+                                continue
+                    except (ValueError, IndexError):
+                        # If parsing fails, treat as regular text
+                        pass
+                _content_str += text_content
+            elif "inlineData" in part:
+                mime_type = part["inlineData"]["mimeType"]
+                data = part["inlineData"]["data"]
+                # Check if inline data is audio - if so, exclude from text content
+                if mime_type.startswith("audio/"):
+                    continue
+                _content_str += "data:{};base64,{}".format(mime_type, data)
+
             if len(_content_str) > 0:
                 if part.get("thought") is True:
                     if reasoning_content_str is None:
@@ -696,6 +714,47 @@ def get_assistant_content_message(
 
         return content_str, reasoning_content_str
 
+    def _extract_audio_response_from_parts(
+        self, parts: List[HttpxPartType]
+    ) -> Optional[ChatCompletionAudioResponse]:
+        """Extract audio response from parts if present"""
+        for part in parts:
+            if "text" in part:
+                text_content = part["text"]
+                # Check if text content contains audio data URI
+                if text_content.startswith("data:audio") and ";base64," in text_content:
+                    try:
+                        if is_base64_encoded(text_content):
+                            media_type, audio_data = text_content.split("data:")[1].split(";base64,")
+
+                            if media_type.startswith("audio/"):
+                                expires_at = int(time.time()) + (24 * 60 * 60)
+                                transcript = ""  # Gemini doesn't provide transcript
+
+                                return ChatCompletionAudioResponse(
+                                    data=audio_data,
+                                    expires_at=expires_at,
+                                    transcript=transcript
+                                )
+                    except (ValueError, IndexError):
+                        pass
+
+            elif "inlineData" in part:
+                mime_type = part["inlineData"]["mimeType"]
+                data = part["inlineData"]["data"]
+
+                if mime_type.startswith("audio/"):
+                    expires_at = int(time.time()) + (24 * 60 * 60)
+                    transcript = ""  # Gemini doesn't provide transcript
+
+                    return ChatCompletionAudioResponse(
+                        data=data,
+                        expires_at=expires_at,
+                        transcript=transcript
+                    )
+
+        return None
+
     def _transform_parts(
         self,
         parts: List[HttpxPartType],
@@ -981,8 +1040,17 @@ def _process_candidates(
             ) = VertexGeminiConfig().get_assistant_content_message(
                 parts=candidate["content"]["parts"]
            )
-            if content is not None:
+
+            audio_response = VertexGeminiConfig()._extract_audio_response_from_parts(
+                parts=candidate["content"]["parts"]
+            )
+
+            if audio_response is not None:
+                cast(Dict[str, Any], chat_completion_message)["audio"] = audio_response
+                chat_completion_message["content"] = None  # OpenAI spec
+            elif content is not None:
                 chat_completion_message["content"] = content
+
             if reasoning_content is not None:
                 chat_completion_message["reasoning_content"] = reasoning_content
 
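The extractor above returns the base64 payload untouched, so callers still need to wrap the raw PCM themselves to get a playable file. A hedged sketch, assuming 24 kHz mono 16-bit PCM (the L16 format the validation error message refers to; verify the actual rate from the returned mimeType before relying on it):

```python
# Hedged sketch: wrap the base64 PCM payload from message.audio.data in a WAV container.
# Sample rate and channel count are assumptions (24 kHz mono), not guaranteed by this diff.
import base64
import wave


def save_pcm16_as_wav(b64_data: str, path: str, rate: int = 24000) -> None:
    pcm_bytes = base64.b64decode(b64_data)
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)      # mono (assumed)
        wav_file.setsampwidth(2)      # 16-bit samples, i.e. pcm16
        wav_file.setframerate(rate)   # assumed 24 kHz
        wav_file.writeframes(pcm_bytes)


# save_pcm16_as_wav(response.choices[0].message.audio.data, "out.wav")
```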

litellm/types/llms/vertex_ai.py

Lines changed: 13 additions & 0 deletions
@@ -176,6 +176,18 @@ class GeminiThinkingConfig(TypedDict, total=False):
 GeminiResponseModalities = Literal["TEXT", "IMAGE", "AUDIO", "VIDEO"]
 
 
+class PrebuiltVoiceConfig(TypedDict):
+    voiceName: str
+
+
+class VoiceConfig(TypedDict):
+    prebuiltVoiceConfig: PrebuiltVoiceConfig
+
+
+class SpeechConfig(TypedDict, total=False):
+    voiceConfig: VoiceConfig
+
+
 class GenerationConfig(TypedDict, total=False):
     temperature: float
     top_p: float
@@ -252,6 +264,7 @@ class RequestBody(TypedDict, total=False):
     safetySettings: List[SafetSettingsConfig]
     generationConfig: GenerationConfig
     cachedContent: str
+    speechConfig: SpeechConfig
 
 
 class CachedContentRequestBody(TypedDict, total=False):
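A small sketch of how the new TypedDicts compose, mirroring the nesting defined above (the voice name is an example, not part of this diff):

```python
# Illustrative composition of the new typed dicts (voice name is an example).
from litellm.types.llms.vertex_ai import (
    PrebuiltVoiceConfig,
    RequestBody,
    SpeechConfig,
    VoiceConfig,
)

speech_config: SpeechConfig = {
    "voiceConfig": VoiceConfig(
        prebuiltVoiceConfig=PrebuiltVoiceConfig(voiceName="Kore")
    )
}

request_body: RequestBody = {"speechConfig": speech_config}
```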
