-
Notifications
You must be signed in to change notification settings - Fork 2.8k
v3:stt and tts models #4603
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
v3:stt and tts models #4603
Changes from all commits
6f83e4a
d74ce83
6fec239
db17cc3
cb128a3
77c6622
727e25d
10032db
d0fbb4e
388c982
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,7 +53,8 @@ | |
| SARVAM_STT_TRANSLATE_STREAMING_URL = "wss://api.sarvam.ai/speech-to-text-translate/ws" | ||
|
|
||
| # Models | ||
| SarvamSTTModels = Literal["saarika:v2.5", "saarika:v2.0", "saaras:v2.5"] | ||
| SarvamSTTModels = Literal["saarika:v2.5", "saaras:v2.5", "saaras:v3"] | ||
| SarvamSTTModes = Literal["transcribe", "translate", "verbatim", "translit", "codemix"] | ||
|
|
||
|
|
||
| class ConnectionState(enum.Enum): | ||
|
|
@@ -73,6 +74,7 @@ class SarvamSTTOptions: | |
| Args: | ||
| language: BCP-47 language code, e.g., "hi-IN", "en-IN" | ||
| model: The Sarvam STT model to use | ||
| mode: Mode for saaras:v3 (transcribe/translate/verbatim/translit/codemix) | ||
| base_url: API endpoint URL (auto-determined from model if not provided) | ||
| streaming_url: WebSocket streaming URL (auto-determined from model if not provided) | ||
| prompt: Optional prompt for STT translate (saaras models only) | ||
|
|
@@ -81,6 +83,7 @@ class SarvamSTTOptions: | |
| language: str # BCP-47 language code, e.g., "hi-IN", "en-IN" | ||
| api_key: str | ||
| model: SarvamSTTModels | str = "saarika:v2.5" | ||
| mode: SarvamSTTModes | str = "transcribe" | ||
| base_url: str | None = None | ||
| streaming_url: str | None = None | ||
| prompt: str | None = None # Optional prompt for STT translate (saaras models only) | ||
|
|
@@ -97,6 +100,20 @@ def __post_init__(self) -> None: | |
| self.base_url = base_url | ||
| if self.streaming_url is None: | ||
| self.streaming_url = streaming_url | ||
| if self.model == "saaras:v3": | ||
| allowed_modes: set[str] = { | ||
| "transcribe", | ||
| "translate", | ||
| "verbatim", | ||
| "translit", | ||
| "codemix", | ||
| } | ||
| if self.mode not in allowed_modes: | ||
| raise ValueError( | ||
| "mode must be one of transcribe, translate, verbatim, translit, codemix" | ||
| ) | ||
| else: | ||
| self.mode = "transcribe" | ||
| if self.sample_rate <= 0: | ||
| raise ValueError("sample_rate must be greater than zero") | ||
|
|
||
|
|
@@ -110,7 +127,7 @@ def _get_urls_for_model(model: str) -> tuple[str, str]: | |
| Returns: | ||
| Tuple of (base_url, streaming_url) | ||
| """ | ||
| if model.startswith("saaras:"): | ||
| if model.startswith("saaras:v2.5"): | ||
| return SARVAM_STT_TRANSLATE_BASE_URL, SARVAM_STT_TRANSLATE_STREAMING_URL | ||
| else: # saarika models | ||
| return SARVAM_STT_BASE_URL, SARVAM_STT_STREAMING_URL | ||
|
Comment on lines
+130
to
133
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '120,145p'Repository: livekit/agents Length of output: 1151 🏁 Script executed: # Search for any documentation or comments about saaras models and endpoints
rg -A 5 -B 5 "saaras" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/ --context=3Repository: livekit/agents Length of output: 12474 🏁 Script executed: # Check for tests that might indicate which endpoint should be used for which model
fd -t f "test.*\.py" livekit-plugins/livekit-plugins-sarvam/ -exec grep -l "saaras\|_get_urls_for_model" {} \;Repository: livekit/agents Length of output: 279 🏁 Script executed: # Look for test files or examples
fd -t f "test" livekit-plugins/livekit-plugins-sarvam/ | head -20Repository: livekit/agents Length of output: 40 🏁 Script executed: # Search for README or documentation
fd -t f -e md livekit-plugins/livekit-plugins-sarvam/Repository: livekit/agents Length of output: 519 🏁 Script executed: # Look for any API endpoint documentation or comments in the codebase
rg -i "saaras.*endpoint\|translate.*endpoint\|speech.*to.*text" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py -A 2 -B 2Repository: livekit/agents Length of output: 40 🏁 Script executed: # Check how the endpoints are actually used in requests
rg -A 10 "self._opts.base_url\|base_url.*=\|self\._base_url" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.pyRepository: livekit/agents Length of output: 40 🏁 Script executed: cd livekit-plugins/livekit-plugins-sarvam && find . -type f -name "*test*" -o -name "*example*" | head -20Repository: livekit/agents Length of output: 40 🏁 Script executed: # Look for README or documentation about API endpoints
cd livekit-plugins/livekit-plugins-sarvam && find . -type f -name "*.md"Repository: livekit/agents Length of output: 69 🏁 Script executed: # Check the full stt.py file around the endpoint definitions
cd livekit-plugins/livekit-plugins-sarvam && head -50 livekit/plugins/sarvam/stt.pyRepository: livekit/agents Length of output: 1424 🏁 Script executed: # Search for how saaras:v3 mode is used and what endpoints it expects
rg "saaras:v3" livekit-plugins/livekit-plugins-sarvam/Repository: livekit/agents Length of output: 1736 🏁 Script executed: cat livekit-plugins/livekit-plugins-sarvam/README.mdRepository: livekit/agents Length of output: 756 🏁 Script executed: # Look for how prompts are used with different models
rg -A 5 -B 5 "prompt" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | head -60Repository: livekit/agents Length of output: 2351 🏁 Script executed: # Check if saaras:v3 actually uses the prompt or just mode
rg "saaras:v3.*prompt\|prompt.*saaras:v3" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.pyRepository: livekit/agents Length of output: 40 🏁 Script executed: # Look at how parameters are sent for different models
rg -A 10 "form_data.*saaras\|if.*model.*saaras" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.pyRepository: livekit/agents Length of output: 40 🏁 Script executed: # Find the full context around how form_data and endpoints are used
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '170,240p'Repository: livekit/agents Length of output: 3145 🏁 Script executed: # Look for the async _recognize_http method where the endpoint is actually used
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '155,200p'Repository: livekit/agents Length of output: 1977 🏁 Script executed: # Check for WebSocket connection and how it uses the URL
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '340,420p'Repository: livekit/agents Length of output: 4378 🏁 Script executed: # Search the entire file for where _get_urls_for_model is called
rg -n "_get_urls_for_model" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.pyRepository: livekit/agents Length of output: 277 🏁 Script executed: # Look at the initialization code to see how base_url and streaming_url are determined
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '570,590p'Repository: livekit/agents Length of output: 1097 🏁 Script executed: # Check where the URL is actually used to make requests
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '430,470p'Repository: livekit/agents Length of output: 1584 🏁 Script executed: # Look for any comments about saaras:v3 or endpoint documentation
grep -n "saaras\|endpoint\|translate" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | head -30Repository: livekit/agents Length of output: 2122 🏁 Script executed: # Look for tests or examples showing how saaras:v3 should work
find livekit-plugins/livekit-plugins-sarvam -name "*.py" -type f | xargs grep -l "test\|example" | head -5Repository: livekit/agents Length of output: 40 🏁 Script executed: # Check if there are any comments or documentation about endpoint selection for saaras:v3
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '1,130p'Repository: livekit/agents Length of output: 5244 🏁 Script executed: # Look for any API documentation or comments about mode and endpoint
rg -B 10 "speech-to-text-translate" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | head -40Repository: livekit/agents Length of output: 549 🏁 Script executed: # Look at the recognize_http method to see how base_url is used
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '245,290p'Repository: livekit/agents Length of output: 2159
The condition `model.startswith("saaras:v2.5")` routes `saaras:v3` into the `else` branch (commented `# saarika models`), which returns `SARVAM_STT_BASE_URL` / `SARVAM_STT_STREAMING_URL`. If that routing is intentional, change the condition (or the misleading `# saarika models` comment) so the intent is explicit. 🤖 Prompt for AI Agents
dhruvladia-sarvam marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
@@ -151,6 +168,8 @@ def _build_websocket_url(base_url: str, opts: SarvamSTTOptions) -> str: | |
| params["high_vad_sensitivity"] = str(opts.high_vad_sensitivity).lower() | ||
| if opts.flush_signal is not None: | ||
| params["flush_signal"] = str(opts.flush_signal).lower() | ||
| if opts.model == "saaras:v3": | ||
| params["mode"] = opts.mode | ||
| if opts.input_audio_codec: | ||
| params["input_audio_codec"] = opts.input_audio_codec | ||
|
|
||
|
|
@@ -166,6 +185,7 @@ class STT(stt.STT): | |
| Args: | ||
| language: BCP-47 language code, e.g., "hi-IN", "en-IN" | ||
| model: The Sarvam STT model to use | ||
| mode: Mode for saaras:v3 (transcribe/translate/verbatim/translit/codemix) | ||
| api_key: Sarvam.ai API key (falls back to SARVAM_API_KEY env var) | ||
| base_url: API endpoint URL | ||
| http_session: Optional aiohttp session to use | ||
|
|
@@ -177,6 +197,7 @@ def __init__( | |
| *, | ||
| language: str = "en-IN", | ||
| model: SarvamSTTModels | str = "saarika:v2.5", | ||
| mode: SarvamSTTModes | str = "transcribe", | ||
| api_key: str | None = None, | ||
| base_url: str | None = None, | ||
| http_session: aiohttp.ClientSession | None = None, | ||
|
|
@@ -206,6 +227,7 @@ def __init__( | |
| language=language, | ||
| api_key=self._api_key, | ||
| model=model, | ||
| mode=mode, | ||
| base_url=base_url, | ||
| prompt=prompt, | ||
| high_vad_sensitivity=high_vad_sensitivity, | ||
|
|
@@ -236,6 +258,7 @@ async def _recognize_impl( | |
| *, | ||
| language: NotGivenOr[str] = NOT_GIVEN, | ||
| model: NotGivenOr[SarvamSTTModels | str] = NOT_GIVEN, | ||
| mode: NotGivenOr[SarvamSTTModes | str] = NOT_GIVEN, | ||
| conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, | ||
| ) -> stt.SpeechEvent: | ||
| """Recognize speech using Sarvam.ai API. | ||
|
|
@@ -254,8 +277,11 @@ async def _recognize_impl( | |
| APIStatusError: On API errors (non-200 status) | ||
| APITimeoutError: On API timeout | ||
| """ | ||
| opts_language = self._opts.language if isinstance(language, type(NOT_GIVEN)) else language | ||
| opts_model = self._opts.model if isinstance(model, type(NOT_GIVEN)) else model | ||
| opts_language = self._opts.language if not is_given(language) else language | ||
| opts_model = self._opts.model if not is_given(model) else model | ||
| opts_mode = self._opts.mode if not is_given(mode) else mode | ||
| if is_given(mode) and opts_model != "saaras:v3": | ||
| raise ValueError("mode is only supported when model is saaras:v3") | ||
|
|
||
| wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes() | ||
|
|
||
|
|
@@ -269,6 +295,8 @@ async def _recognize_impl( | |
| form_data.add_field("language_code", opts_language) | ||
| if opts_model: | ||
| form_data.add_field("model", str(opts_model)) | ||
| if opts_model == "saaras:v3": | ||
| form_data.add_field("mode", str(opts_mode)) | ||
|
|
||
| if not self._api_key: | ||
| raise ValueError("API key cannot be None") | ||
|
|
@@ -351,6 +379,7 @@ def stream( | |
| *, | ||
| language: NotGivenOr[str] = NOT_GIVEN, | ||
| model: NotGivenOr[SarvamSTTModels | str] = NOT_GIVEN, | ||
| mode: NotGivenOr[SarvamSTTModes | str] = NOT_GIVEN, | ||
| conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, | ||
| prompt: NotGivenOr[str] = NOT_GIVEN, | ||
| high_vad_sensitivity: NotGivenOr[bool] = NOT_GIVEN, | ||
|
|
@@ -361,11 +390,16 @@ def stream( | |
| """Create a streaming transcription session.""" | ||
| opts_language = language if is_given(language) else self._opts.language | ||
| opts_model = model if is_given(model) else self._opts.model | ||
| opts_mode = mode if is_given(mode) else self._opts.mode | ||
| if is_given(mode) and opts_model != "saaras:v3": | ||
| raise ValueError("mode is only supported when model is saaras:v3") | ||
|
|
||
| if not isinstance(opts_language, str): | ||
| opts_language = self._opts.language | ||
| if not isinstance(opts_model, str): | ||
| opts_model = self._opts.model | ||
| if not isinstance(opts_mode, str): | ||
| opts_mode = self._opts.mode | ||
|
|
||
| # Handle prompt conversion from NotGiven to None | ||
| final_prompt: str | None | ||
|
|
@@ -390,6 +424,7 @@ def stream( | |
| language=opts_language, | ||
| api_key=self._api_key if self._api_key else "", | ||
| model=opts_model, | ||
| mode=opts_mode, | ||
| prompt=final_prompt, | ||
| high_vad_sensitivity=opts_high_vad, | ||
| sample_rate=opts_sample_rate, | ||
|
|
@@ -524,24 +559,50 @@ async def aclose(self) -> None: | |
| # Clear reference to help with garbage collection | ||
| pass # Session reference will be cleared when object is destroyed | ||
|
|
||
| def update_options(self, *, language: str, model: str, prompt: str | None = None) -> None: | ||
| def update_options( | ||
| self, | ||
| *, | ||
| language: str, | ||
| model: str, | ||
| prompt: str | None = None, | ||
| mode: str | None = None, | ||
| ) -> None: | ||
| """Update streaming options.""" | ||
| if not language or not language.strip(): | ||
| raise ValueError("Language cannot be empty") | ||
| if not model or not model.strip(): | ||
| raise ValueError("Model cannot be empty") | ||
|
|
||
| self._opts.language = language | ||
| self._opts.model = model | ||
| self._opts.base_url, self._opts.streaming_url = _get_urls_for_model(model) | ||
| if prompt is not None: | ||
| self._opts.prompt = prompt | ||
| if mode is not None and model != "saaras:v3": | ||
| raise ValueError("mode is only supported when model is saaras:v3") | ||
| if model == "saaras:v3": | ||
| allowed_modes: set[str] = { | ||
| "transcribe", | ||
| "translate", | ||
| "verbatim", | ||
| "translit", | ||
| "codemix", | ||
| } | ||
| if mode is not None and mode not in allowed_modes: | ||
| raise ValueError( | ||
| "mode must be one of transcribe, translate, verbatim, translit, codemix" | ||
| ) | ||
| if mode is not None: | ||
| self._opts.mode = mode | ||
| else: | ||
| self._opts.mode = "transcribe" | ||
| self._logger.info( | ||
| "Options updated, triggering reconnection", | ||
| extra={ | ||
| "session_id": self._session_id, | ||
| "language": language, | ||
| "model": model, | ||
| "prompt": prompt, | ||
| "mode": self._opts.mode, | ||
| }, | ||
| ) | ||
| self._reconnect_event.set() | ||
|
|
||
devin-ai-integration[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the right URL for `saaras:v3`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"https://api.sarvam.ai/speech-to-text" and "wss://api.sarvam.ai/speech-to-text/ws" for rest and websocket respectively, that is set in variables SARVAM_STT_BASE_URL and SARVAM_STT_STREAMING_URL respectively
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it — the comment is a bit confusing, since it says `else:  # saarika models`, but `saaras:v3` isn't a saarika model?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is an old comment, I will modify it