 ## httpx client for vertex ai calls
 ## Initial implementation - covers gemini + image gen calls
 import json
+import time
 import uuid
 from copy import deepcopy
 from functools import partial

     UsageMetadata,
 )
 from litellm.types.utils import (
+    ChatCompletionAudioResponse,
     ChatCompletionTokenLogprob,
     ChoiceLogprobs,
     CompletionTokensDetailsWrapper,
     TopLogprob,
     Usage,
 )
-from litellm.utils import CustomStreamWrapper, ModelResponse, supports_reasoning
+from litellm.utils import CustomStreamWrapper, ModelResponse, is_base64_encoded, supports_reasoning
 
 from ....utils import _remove_additional_properties, _remove_strict_from_schema
 from ..common_utils import VertexAIError, _build_vertex_schema
@@ -676,14 +678,30 @@ def get_assistant_content_message(
     ) -> Tuple[Optional[str], Optional[str]]:
         content_str: Optional[str] = None
         reasoning_content_str: Optional[str] = None
+
         for part in parts:
             _content_str = ""
             if "text" in part:
-                _content_str += part["text"]
-            elif "inlineData" in part:  # base64 encoded image
-                _content_str += "data:{};base64,{}".format(
-                    part["inlineData"]["mimeType"], part["inlineData"]["data"]
-                )
+                text_content = part["text"]
+                # Check if text content is audio data URI - if so, exclude from text content
+                if text_content.startswith("data:audio") and ";base64," in text_content:
+                    try:
+                        if is_base64_encoded(text_content):
+                            media_type, _ = text_content.split("data:")[1].split(";base64,")
+                            if media_type.startswith("audio/"):
+                                continue
+                    except (ValueError, IndexError):
+                        # If parsing fails, treat as regular text
+                        pass
+                _content_str += text_content
+            elif "inlineData" in part:
+                mime_type = part["inlineData"]["mimeType"]
+                data = part["inlineData"]["data"]
+                # Check if inline data is audio - if so, exclude from text content
+                if mime_type.startswith("audio/"):
+                    continue
+                _content_str += "data:{};base64,{}".format(mime_type, data)
+
             if len(_content_str) > 0:
                 if part.get("thought") is True:
                     if reasoning_content_str is None:
@@ -696,6 +714,47 @@ def get_assistant_content_message(
 
         return content_str, reasoning_content_str
 
+    def _extract_audio_response_from_parts(
+        self, parts: List[HttpxPartType]
+    ) -> Optional[ChatCompletionAudioResponse]:
+        """Extract audio response from parts if present"""
+        for part in parts:
+            if "text" in part:
+                text_content = part["text"]
+                # Check if text content contains audio data URI
+                if text_content.startswith("data:audio") and ";base64," in text_content:
+                    try:
+                        if is_base64_encoded(text_content):
+                            media_type, audio_data = text_content.split("data:")[1].split(";base64,")
+
+                            if media_type.startswith("audio/"):
+                                expires_at = int(time.time()) + (24 * 60 * 60)
+                                transcript = ""  # Gemini doesn't provide transcript
+
+                                return ChatCompletionAudioResponse(
+                                    data=audio_data,
+                                    expires_at=expires_at,
+                                    transcript=transcript
+                                )
+                    except (ValueError, IndexError):
+                        pass
+
+            elif "inlineData" in part:
+                mime_type = part["inlineData"]["mimeType"]
+                data = part["inlineData"]["data"]
+
+                if mime_type.startswith("audio/"):
+                    expires_at = int(time.time()) + (24 * 60 * 60)
+                    transcript = ""  # Gemini doesn't provide transcript
+
+                    return ChatCompletionAudioResponse(
+                        data=data,
+                        expires_at=expires_at,
+                        transcript=transcript
+                    )
+
+        return None
+
     def _transform_parts(
         self,
         parts: List[HttpxPartType],
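As an aside, the data-URI handling that both `get_assistant_content_message` and the new `_extract_audio_response_from_parts` rely on is a small amount of string parsing. Below is a minimal standalone sketch of that logic; the `parse_audio_data_uri` helper and the sample payload are illustrative only (they are not part of this diff), and the litellm-internal `is_base64_encoded` check is omitted.

```python
import time
from typing import Optional, Tuple


def parse_audio_data_uri(text: str) -> Optional[Tuple[str, str]]:
    """Return (media_type, base64_payload) for a data:audio/...;base64,... string, else None."""
    if not (text.startswith("data:audio") and ";base64," in text):
        return None
    try:
        # Same split strategy as the diff: drop the "data:" prefix, then cut at ";base64,"
        media_type, payload = text.split("data:")[1].split(";base64,")
    except (ValueError, IndexError):
        return None
    if not media_type.startswith("audio/"):
        return None
    return media_type, payload


# Example with a truncated placeholder payload, just to show the shapes involved.
parsed = parse_audio_data_uri("data:audio/wav;base64,UklGRiQAAABXQVZF")
if parsed is not None:
    media_type, audio_data = parsed
    expires_at = int(time.time()) + 24 * 60 * 60  # same 24-hour expiry the diff uses
    print(media_type, len(audio_data), expires_at)
```

Splitting on `";base64,"` keeps the media type and the payload separate, which is what lets the transformation route audio parts into the message's audio field instead of the text content.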
@@ -981,8 +1040,17 @@ def _process_candidates(
             ) = VertexGeminiConfig().get_assistant_content_message(
                 parts=candidate["content"]["parts"]
             )
-            if content is not None:
+
+            audio_response = VertexGeminiConfig()._extract_audio_response_from_parts(
+                parts=candidate["content"]["parts"]
+            )
+
+            if audio_response is not None:
+                cast(Dict[str, Any], chat_completion_message)["audio"] = audio_response
+                chat_completion_message["content"] = None  # OpenAI spec
+            elif content is not None:
                 chat_completion_message["content"] = content
+
             if reasoning_content is not None:
                 chat_completion_message["reasoning_content"] = reasoning_content
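For context on what the `_process_candidates` change produces, here is a hedged sketch of the assistant message a caller might see when a Gemini candidate carries audio. The layout mirrors the OpenAI `message.audio` convention the diff follows; the values are placeholders, not real output.

```python
import time

# Hypothetical transformed message, assuming the candidate contained an
# inlineData part with an "audio/*" mime type.
chat_completion_message = {
    "role": "assistant",
    "content": None,  # content is nulled out when audio is attached (OpenAI spec)
    "audio": {
        "data": "UklGRiQAAABXQVZF...",  # base64-encoded audio payload (truncated placeholder)
        "expires_at": int(time.time()) + 24 * 60 * 60,  # 24-hour expiry, as in the diff
        "transcript": "",  # Gemini does not return a transcript
    },
}
```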