Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@
SARVAM_STT_TRANSLATE_STREAMING_URL = "wss://api.sarvam.ai/speech-to-text-translate/ws"

# Models
SarvamSTTModels = Literal["saarika:v2.5", "saarika:v2.0", "saaras:v2.5"]
SarvamSTTModels = Literal["saarika:v2.5", "saaras:v2.5", "saaras:v3"]
SarvamSTTModes = Literal["transcribe", "translate", "verbatim", "translit", "codemix"]


class ConnectionState(enum.Enum):
Expand All @@ -73,6 +74,7 @@ class SarvamSTTOptions:
Args:
language: BCP-47 language code, e.g., "hi-IN", "en-IN"
model: The Sarvam STT model to use
mode: Mode for saaras:v3 (transcribe/translate/verbatim/translit/codemix)
base_url: API endpoint URL (auto-determined from model if not provided)
streaming_url: WebSocket streaming URL (auto-determined from model if not provided)
prompt: Optional prompt for STT translate (saaras models only)
Expand All @@ -81,6 +83,7 @@ class SarvamSTTOptions:
language: str # BCP-47 language code, e.g., "hi-IN", "en-IN"
api_key: str
model: SarvamSTTModels | str = "saarika:v2.5"
mode: SarvamSTTModes | str = "transcribe"
base_url: str | None = None
streaming_url: str | None = None
prompt: str | None = None # Optional prompt for STT translate (saaras models only)
Expand All @@ -97,6 +100,20 @@ def __post_init__(self) -> None:
self.base_url = base_url
if self.streaming_url is None:
self.streaming_url = streaming_url
if self.model == "saaras:v3":
allowed_modes: set[str] = {
"transcribe",
"translate",
"verbatim",
"translit",
"codemix",
}
if self.mode not in allowed_modes:
raise ValueError(
"mode must be one of transcribe, translate, verbatim, translit, codemix"
)
else:
self.mode = "transcribe"
if self.sample_rate <= 0:
raise ValueError("sample_rate must be greater than zero")

Expand All @@ -110,7 +127,7 @@ def _get_urls_for_model(model: str) -> tuple[str, str]:
Returns:
Tuple of (base_url, streaming_url)
"""
if model.startswith("saaras:"):
if model.startswith("saaras:v2.5"):
return SARVAM_STT_TRANSLATE_BASE_URL, SARVAM_STT_TRANSLATE_STREAMING_URL
else: # saarika models
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is the right url for saaras:v3?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"https://api.sarvam.ai/speech-to-text" and "wss://api.sarvam.ai/speech-to-text/ws" for rest and websocket respectively, that is set in variables SARVAM_STT_BASE_URL and SARVAM_STT_STREAMING_URL respectively

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

got it, the comment is a bit confusing.. since it says else: # saarika models, but saaras:v3 isn't a saarika model?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is an old comment, I will modify it

return SARVAM_STT_BASE_URL, SARVAM_STT_STREAMING_URL
Comment on lines +130 to 133
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '120,145p'

Repository: livekit/agents

Length of output: 1151


🏁 Script executed:

# Search for any documentation or comments about saaras models and endpoints
rg -A 5 -B 5 "saaras" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/ --context=3

Repository: livekit/agents

Length of output: 12474


🏁 Script executed:

# Check for tests that might indicate which endpoint should be used for which model
fd -t f "test.*\.py" livekit-plugins/livekit-plugins-sarvam/ -exec grep -l "saaras\|_get_urls_for_model" {} \;

Repository: livekit/agents

Length of output: 279


🏁 Script executed:

# Look for test files or examples
fd -t f "test" livekit-plugins/livekit-plugins-sarvam/ | head -20

Repository: livekit/agents

Length of output: 40


🏁 Script executed:

# Search for README or documentation
fd -t f -e md livekit-plugins/livekit-plugins-sarvam/

Repository: livekit/agents

Length of output: 519


🏁 Script executed:

# Look for any API endpoint documentation or comments in the codebase
rg -i "saaras.*endpoint\|translate.*endpoint\|speech.*to.*text" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py -A 2 -B 2

Repository: livekit/agents

Length of output: 40


🏁 Script executed:

# Check how the endpoints are actually used in requests
rg -A 10 "self._opts.base_url\|base_url.*=\|self\._base_url" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py

Repository: livekit/agents

Length of output: 40


🏁 Script executed:

cd livekit-plugins/livekit-plugins-sarvam && find . -type f -name "*test*" -o -name "*example*" | head -20

Repository: livekit/agents

Length of output: 40


🏁 Script executed:

# Look for README or documentation about API endpoints
cd livekit-plugins/livekit-plugins-sarvam && find . -type f -name "*.md"

Repository: livekit/agents

Length of output: 69


🏁 Script executed:

# Check the full stt.py file around the endpoint definitions
cd livekit-plugins/livekit-plugins-sarvam && head -50 livekit/plugins/sarvam/stt.py

Repository: livekit/agents

Length of output: 1424


🏁 Script executed:

# Search for how saaras:v3 mode is used and what endpoints it expects
rg "saaras:v3" livekit-plugins/livekit-plugins-sarvam/

Repository: livekit/agents

Length of output: 1736


🏁 Script executed:

cat livekit-plugins/livekit-plugins-sarvam/README.md

Repository: livekit/agents

Length of output: 756


🏁 Script executed:

# Look for how prompts are used with different models
rg -A 5 -B 5 "prompt" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | head -60

Repository: livekit/agents

Length of output: 2351


🏁 Script executed:

# Check if saaras:v3 actually uses the prompt or just mode
rg "saaras:v3.*prompt\|prompt.*saaras:v3" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py

Repository: livekit/agents

Length of output: 40


🏁 Script executed:

# Look at how parameters are sent for different models
rg -A 10 "form_data.*saaras\|if.*model.*saaras" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py

Repository: livekit/agents

Length of output: 40


🏁 Script executed:

# Find the full context around how form_data and endpoints are used
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '170,240p'

Repository: livekit/agents

Length of output: 3145


🏁 Script executed:

# Look for the async _recognize_http method where the endpoint is actually used
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '155,200p'

Repository: livekit/agents

Length of output: 1977


🏁 Script executed:

# Check for WebSocket connection and how it uses the URL
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '340,420p'

Repository: livekit/agents

Length of output: 4378


🏁 Script executed:

# Search the entire file for where _get_urls_for_model is called
rg -n "_get_urls_for_model" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py

Repository: livekit/agents

Length of output: 277


🏁 Script executed:

# Look at the initialization code to see how base_url and streaming_url are determined
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '570,590p'

Repository: livekit/agents

Length of output: 1097


🏁 Script executed:

# Check where the URL is actually used to make requests
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '430,470p'

Repository: livekit/agents

Length of output: 1584


🏁 Script executed:

# Look for any comments about saaras:v3 or endpoint documentation
grep -n "saaras\|endpoint\|translate" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | head -30

Repository: livekit/agents

Length of output: 2122


🏁 Script executed:

# Look for tests or examples showing how saaras:v3 should work
find livekit-plugins/livekit-plugins-sarvam -name "*.py" -type f | xargs grep -l "test\|example" | head -5

Repository: livekit/agents

Length of output: 40


🏁 Script executed:

# Check if there are any comments or documentation about endpoint selection for saaras:v3
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '1,130p'

Repository: livekit/agents

Length of output: 5244


🏁 Script executed:

# Look for any API documentation or comments about mode and endpoint
rg -B 10 "speech-to-text-translate" livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | head -40

Repository: livekit/agents

Length of output: 549


🏁 Script executed:

# Look at the recognize_http method to see how base_url is used
cat -n livekit-plugins/livekit-plugins-sarvam/livekit/plugins/sarvam/stt.py | sed -n '245,290p'

Repository: livekit/agents

Length of output: 2159


saaras:v3 routing — note on this automated finding: the suggested fix (broadening the condition to `model.startswith("saaras")` so saaras:v3 uses the translate endpoint) contradicts the PR author's clarification earlier in this thread, which states that saaras:v3 uses "https://api.sarvam.ai/speech-to-text" and "wss://api.sarvam.ai/speech-to-text/ws" (SARVAM_STT_BASE_URL and SARVAM_STT_STREAMING_URL). Under that clarification, the condition `model.startswith("saaras:v2.5")` routes saaras:v3 to the base STT endpoints intentionally, and the remaining defect is only the misleading `# saarika models` comment on the else branch — that branch also covers saaras:v3, and the author has agreed to update the comment accordingly. No endpoint-routing change should be applied without confirming against the Sarvam API documentation.

Expand Down Expand Up @@ -151,6 +168,8 @@ def _build_websocket_url(base_url: str, opts: SarvamSTTOptions) -> str:
params["high_vad_sensitivity"] = str(opts.high_vad_sensitivity).lower()
if opts.flush_signal is not None:
params["flush_signal"] = str(opts.flush_signal).lower()
if opts.model == "saaras:v3":
params["mode"] = opts.mode
if opts.input_audio_codec:
params["input_audio_codec"] = opts.input_audio_codec

Expand All @@ -166,6 +185,7 @@ class STT(stt.STT):
Args:
language: BCP-47 language code, e.g., "hi-IN", "en-IN"
model: The Sarvam STT model to use
mode: Mode for saaras:v3 (transcribe/translate/verbatim/translit/codemix)
api_key: Sarvam.ai API key (falls back to SARVAM_API_KEY env var)
base_url: API endpoint URL
http_session: Optional aiohttp session to use
Expand All @@ -177,6 +197,7 @@ def __init__(
*,
language: str = "en-IN",
model: SarvamSTTModels | str = "saarika:v2.5",
mode: SarvamSTTModes | str = "transcribe",
api_key: str | None = None,
base_url: str | None = None,
http_session: aiohttp.ClientSession | None = None,
Expand Down Expand Up @@ -206,6 +227,7 @@ def __init__(
language=language,
api_key=self._api_key,
model=model,
mode=mode,
base_url=base_url,
prompt=prompt,
high_vad_sensitivity=high_vad_sensitivity,
Expand Down Expand Up @@ -236,6 +258,7 @@ async def _recognize_impl(
*,
language: NotGivenOr[str] = NOT_GIVEN,
model: NotGivenOr[SarvamSTTModels | str] = NOT_GIVEN,
mode: NotGivenOr[SarvamSTTModes | str] = NOT_GIVEN,
conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> stt.SpeechEvent:
"""Recognize speech using Sarvam.ai API.
Expand All @@ -254,8 +277,11 @@ async def _recognize_impl(
APIStatusError: On API errors (non-200 status)
APITimeoutError: On API timeout
"""
opts_language = self._opts.language if isinstance(language, type(NOT_GIVEN)) else language
opts_model = self._opts.model if isinstance(model, type(NOT_GIVEN)) else model
opts_language = self._opts.language if not is_given(language) else language
opts_model = self._opts.model if not is_given(model) else model
opts_mode = self._opts.mode if not is_given(mode) else mode
if is_given(mode) and opts_model != "saaras:v3":
raise ValueError("mode is only supported when model is saaras:v3")

wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes()

Expand All @@ -269,6 +295,8 @@ async def _recognize_impl(
form_data.add_field("language_code", opts_language)
if opts_model:
form_data.add_field("model", str(opts_model))
if opts_model == "saaras:v3":
form_data.add_field("mode", str(opts_mode))

if not self._api_key:
raise ValueError("API key cannot be None")
Expand Down Expand Up @@ -351,6 +379,7 @@ def stream(
*,
language: NotGivenOr[str] = NOT_GIVEN,
model: NotGivenOr[SarvamSTTModels | str] = NOT_GIVEN,
mode: NotGivenOr[SarvamSTTModes | str] = NOT_GIVEN,
conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
prompt: NotGivenOr[str] = NOT_GIVEN,
high_vad_sensitivity: NotGivenOr[bool] = NOT_GIVEN,
Expand All @@ -361,11 +390,16 @@ def stream(
"""Create a streaming transcription session."""
opts_language = language if is_given(language) else self._opts.language
opts_model = model if is_given(model) else self._opts.model
opts_mode = mode if is_given(mode) else self._opts.mode
if is_given(mode) and opts_model != "saaras:v3":
raise ValueError("mode is only supported when model is saaras:v3")

if not isinstance(opts_language, str):
opts_language = self._opts.language
if not isinstance(opts_model, str):
opts_model = self._opts.model
if not isinstance(opts_mode, str):
opts_mode = self._opts.mode

# Handle prompt conversion from NotGiven to None
final_prompt: str | None
Expand All @@ -390,6 +424,7 @@ def stream(
language=opts_language,
api_key=self._api_key if self._api_key else "",
model=opts_model,
mode=opts_mode,
prompt=final_prompt,
high_vad_sensitivity=opts_high_vad,
sample_rate=opts_sample_rate,
Expand Down Expand Up @@ -524,24 +559,50 @@ async def aclose(self) -> None:
# Clear reference to help with garbage collection
pass # Session reference will be cleared when object is destroyed

def update_options(
    self,
    *,
    language: str,
    model: str,
    prompt: str | None = None,
    mode: str | None = None,
) -> None:
    """Update streaming options and trigger a reconnection.

    Args:
        language: BCP-47 language code, e.g., "hi-IN", "en-IN"
        model: The Sarvam STT model to use
        prompt: Optional prompt for STT translate (saaras models only)
        mode: Mode for saaras:v3 (transcribe/translate/verbatim/translit/codemix)

    Raises:
        ValueError: If language or model is empty, if mode is supplied for a
            model other than saaras:v3, or if mode is not a recognized value.
    """
    # Validate every argument up front so that a rejected call never leaves
    # self._opts partially updated (language/model mutated but mode rejected).
    if not language or not language.strip():
        raise ValueError("Language cannot be empty")
    if not model or not model.strip():
        raise ValueError("Model cannot be empty")
    if mode is not None:
        if model != "saaras:v3":
            raise ValueError("mode is only supported when model is saaras:v3")
        allowed_modes: set[str] = {
            "transcribe",
            "translate",
            "verbatim",
            "translit",
            "codemix",
        }
        if mode not in allowed_modes:
            raise ValueError(
                "mode must be one of transcribe, translate, verbatim, translit, codemix"
            )

    self._opts.language = language
    self._opts.model = model
    # Endpoints are derived from the model family, so refresh them whenever
    # the model changes.
    self._opts.base_url, self._opts.streaming_url = _get_urls_for_model(model)
    if prompt is not None:
        self._opts.prompt = prompt
    if model == "saaras:v3":
        # Keep the previously configured mode when the caller does not
        # provide a new one.
        if mode is not None:
            self._opts.mode = mode
    else:
        # mode is only meaningful for saaras:v3; reset to the default so a
        # stale value never leaks into a different model's session.
        self._opts.mode = "transcribe"
    self._logger.info(
        "Options updated, triggering reconnection",
        extra={
            "session_id": self._session_id,
            "language": language,
            "model": model,
            "prompt": prompt,
            "mode": self._opts.mode,
        },
    )
    self._reconnect_event.set()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
SARVAM_TTS_WS_URL = "wss://api.sarvam.ai/text-to-speech/ws"

# Sarvam TTS specific models and speakers
SarvamTTSModels = Literal["bulbul:v2"]
SarvamTTSModels = Literal["bulbul:v2", "bulbul:v3-beta"]

# Supported languages in BCP-47 format
SarvamTTSLanguages = Literal[
Expand All @@ -74,6 +74,34 @@
"abhilash",
"karun",
"hitesh",
# bulbul:v3-beta Customer Care
"shubh",
"ritu",
"rahul",
"pooja",
"simran",
"kavya",
"amit",
"ratan",
"rohan",
"dev",
"ishita",
"shreya",
"manan",
"sumit",
"priya",
# bulbul:v3-beta Content Creation
"aditya",
"kabir",
"neha",
"varun",
"roopa",
"aayan",
"ashutosh",
"advait",
# bulbul:v3-beta International
"amelia",
"sophia",
]

# Model-Speaker compatibility mapping
Expand All @@ -82,7 +110,65 @@
"female": ["anushka", "manisha", "vidya", "arya"],
"male": ["abhilash", "karun", "hitesh"],
"all": ["anushka", "manisha", "vidya", "arya", "abhilash", "karun", "hitesh"],
}
},
"bulbul:v3-beta": {
"female": [
"ritu",
"pooja",
"simran",
"kavya",
"ishita",
"shreya",
"priya",
"neha",
"roopa",
"amelia",
"sophia",
],
"male": [
"shubh",
"rahul",
"amit",
"ratan",
"rohan",
"dev",
"manan",
"sumit",
"aditya",
"kabir",
"varun",
"aayan",
"ashutosh",
"advait",
],
"all": [
"shubh",
"ritu",
"rahul",
"pooja",
"simran",
"kavya",
"amit",
"ratan",
"rohan",
"dev",
"ishita",
"shreya",
"manan",
"sumit",
"priya",
"aditya",
"kabir",
"neha",
"varun",
"roopa",
"aayan",
"ashutosh",
"advait",
"amelia",
"sophia",
],
},
}


Expand Down Expand Up @@ -313,8 +399,6 @@ def update_options(
if model is not None:
if not model.strip():
raise ValueError("Model cannot be empty")
if model not in ["bulbul:v2"]:
raise ValueError(f"Unsupported model: {model}")
self._opts.model = model

if speaker is not None:
Expand Down Expand Up @@ -393,9 +477,7 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
"target_language_code": self._opts.target_language_code,
"text": self._input_text,
"speaker": self._opts.speaker,
"pitch": self._opts.pitch,
"pace": self._opts.pace,
"loudness": self._opts.loudness,
"speech_sample_rate": self._opts.speech_sample_rate,
"enable_preprocessing": self._opts.enable_preprocessing,
"model": self._opts.model,
Expand Down