Merged
7 changes: 7 additions & 0 deletions conftest.py
@@ -17,6 +17,7 @@

from getstream.video.rtc.track_util import PcmData, AudioFormat
from vision_agents.core.stt.events import STTTranscriptEvent, STTErrorEvent, STTPartialTranscriptEvent
from vision_agents.core.edge.types import Participant

load_dotenv()

@@ -127,6 +128,12 @@ def assets_dir():
    return get_assets_dir()


@pytest.fixture
def participant():
    """Create a test participant for STT testing."""
    return Participant({}, user_id="test-user")


@pytest.fixture
def mia_audio_16khz():
    """Load mia.mp3 and convert to 16kHz PCM data."""
1 change: 1 addition & 0 deletions docs/ai/instructions/ai-utils.md
@@ -27,3 +27,4 @@ PcmData.from_response
* AudioForwarder to forward audio. See audio_forwarder.py
* QueuedVideoTrack to have a writable video track
* QueuedAudioTrack to have a writable audio track
* AudioQueue to buffer audio and read back a given number of milliseconds or samples
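The AudioQueue idea can be sketched in plain Python. This is a hypothetical stand-in, not the library's actual class, and it assumes 16-bit mono PCM (2 bytes per sample):

```python
class AudioQueue:
    """Minimal sketch: buffer raw PCM bytes, read by samples or milliseconds."""

    BYTES_PER_SAMPLE = 2  # 16-bit mono PCM

    def __init__(self, sample_rate: int = 16000):
        self.sample_rate = sample_rate
        self._buf = bytearray()

    def write(self, pcm: bytes) -> None:
        self._buf.extend(pcm)

    def read_samples(self, n: int) -> bytes:
        """Pop up to n samples from the front of the buffer."""
        nbytes = n * self.BYTES_PER_SAMPLE
        out = bytes(self._buf[:nbytes])
        del self._buf[:nbytes]
        return out

    def read_ms(self, ms: int) -> bytes:
        """Pop up to ms milliseconds of audio."""
        return self.read_samples(self.sample_rate * ms // 1000)
```

At 16 kHz, `read_ms(20)` pops 320 samples, i.e. 640 bytes — a common frame size for real-time STT.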
137 changes: 137 additions & 0 deletions plugins/elevenlabs/example/README.md
@@ -0,0 +1,137 @@
# ElevenLabs TTS and STT Example

This directory contains an example demonstrating how to use the ElevenLabs TTS and Scribe v2 STT plugins with Vision Agents.

## Overview

This example creates an AI agent that uses ElevenLabs' state-of-the-art voice technology for both speech synthesis and recognition.

## Features

- **ElevenLabs TTS**: High-quality, natural-sounding text-to-speech with customizable voices
- **ElevenLabs Scribe v2**: Real-time speech-to-text with low latency (~150 ms) and support for 99 languages
- **GetStream**: Real-time communication infrastructure
- **Smart Turn Detection**: Natural conversation flow management
- **Gemini LLM**: Intelligent response generation

## Setup

1. Install dependencies:
```bash
cd plugins/elevenlabs/example
uv sync
```

2. Create a `.env` file with your API keys:
```bash
# Required for ElevenLabs TTS and STT
ELEVENLABS_API_KEY=your_elevenlabs_api_key

# Required for GetStream (real-time communication)
STREAM_API_KEY=your_stream_api_key
STREAM_API_SECRET=your_stream_api_secret

# Required for Gemini LLM
GEMINI_API_KEY=your_gemini_api_key
```

## Running the Example

```bash
uv run elevenlabs_example.py
```

The agent will:
1. Connect to the GetStream edge network
2. Initialize ElevenLabs TTS and Scribe v2 STT
3. Join a call and greet you
4. Listen and respond to your voice input in real-time

## Customization

### Voice Selection

You can customize the ElevenLabs voice:

```python
# Use a specific voice ID
tts = elevenlabs.TTS(voice_id="your_voice_id")

# Use a different model
tts = elevenlabs.TTS(model_id="eleven_flash_v2_5")
```

### STT Configuration

Customize the speech-to-text settings:

```python
# Use a different language
stt = elevenlabs.STT(language_code="es") # Spanish

# Adjust VAD settings
stt = elevenlabs.STT(
    vad_threshold=0.5,
    vad_silence_threshold_secs=2.0,
)
```

### Turn Detection

Adjust turn detection sensitivity:

```python
turn_detection = smart_turn.TurnDetection(
    buffer_in_seconds=2.0,  # How long to wait for speech
    confidence_threshold=0.5,  # How confident to be before ending the turn
)
```

## ElevenLabs Models

### TTS Models
- `eleven_multilingual_v2`: High-quality, emotionally rich (default)
- `eleven_flash_v2_5`: Ultra-fast with low latency (~75ms)
- `eleven_turbo_v2_5`: Balanced quality and speed

### STT Model
- `scribe_v2_realtime`: Real-time transcription with support for 99 languages

## Architecture

```
User Voice Input
        ↓
ElevenLabs Scribe v2 STT (Real-time transcription)
        ↓
Gemini LLM (Generate response)
        ↓
ElevenLabs TTS (Synthesize speech)
        ↓
User Hears Response
```

## Additional Resources

- [ElevenLabs Documentation](https://elevenlabs.io/docs)
- [ElevenLabs Voice Library](https://elevenlabs.io/voice-library)
- [Vision Agents Documentation](https://visionagents.ai)
- [GetStream Documentation](https://getstream.io)

## Troubleshooting

### No audio output
- Verify your `ELEVENLABS_API_KEY` is valid
- Check your audio device settings
- Ensure GetStream connection is established

### Poor transcription quality
- Use 16kHz sample rate audio for optimal results
- Speak clearly and avoid background noise
- Adjust `vad_threshold` if needed
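If your capture device records at another rate, resample before feeding audio to the STT. A minimal linear-interpolation sketch using numpy (assumed available; a production pipeline would use a proper resampler):

```python
import numpy as np

def resample_to_16k(samples: np.ndarray, src_rate: int) -> np.ndarray:
    """Linear-interpolation resample of mono int16 PCM to 16 kHz (sketch only)."""
    dst_rate = 16000
    if src_rate == dst_rate:
        return samples
    n_out = int(len(samples) * dst_rate / src_rate)
    # Positions of the output samples on the input time axis
    x_out = np.linspace(0, len(samples) - 1, n_out)
    x_in = np.arange(len(samples))
    return np.interp(x_out, x_in, samples.astype(np.float64)).astype(np.int16)
```

For example, one second of 44.1 kHz audio (44100 samples) comes out as 16000 samples.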

### High latency
- Consider using `eleven_flash_v2_5` for TTS
- Check your network connection
- Reduce `buffer_in_seconds` in turn detection
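The latency tweaks above, combined into one configuration sketch (the parameter values are illustrative, not recommended defaults — check the plugin docs):

```python
from vision_agents.plugins import elevenlabs, smart_turn

# Fastest ElevenLabs TTS model (~75 ms), trading some quality for speed
tts = elevenlabs.TTS(model_id="eleven_flash_v2_5")

# A shorter buffer ends turns sooner; too low may cut speakers off mid-sentence
turn_detection = smart_turn.TurnDetection(buffer_in_seconds=1.0)
```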

2 changes: 2 additions & 0 deletions plugins/elevenlabs/example/__init__.py
@@ -0,0 +1,2 @@
# Example package

67 changes: 67 additions & 0 deletions plugins/elevenlabs/example/assistant.md
@@ -0,0 +1,67 @@
You're a friendly voice AI assistant. Here's your personality and style:

## Communication Style

- Be warm, approachable, and helpful
- Keep responses concise and conversational
- Use natural language without being overly formal
- Show enthusiasm when appropriate

## Response Guidelines

### Helpfulness
- Always aim to provide clear, actionable information
- If you don't know something, admit it honestly
- Offer to help with follow-up questions

### Tone
- Friendly but professional
- Patient and understanding
- Encouraging and positive

### Conversation Flow
- Listen actively to what the user says
- Ask clarifying questions when needed
- Stay on topic unless the user changes direction
- Remember context from earlier in the conversation

## Example Phrases

**Greetings:**
- "Hello! How can I help you today?"
- "Hi there! What can I do for you?"
- "Good to hear from you! What's on your mind?"

**Acknowledgment:**
- "I understand, let me help with that."
- "That's a great question!"
- "I see what you mean."

**Clarification:**
- "Just to make sure I understand, you're asking about..."
- "Could you tell me a bit more about..."
- "To clarify, you want to..."

**Assistance:**
- "Here's what I can help you with..."
- "Let me walk you through this..."
- "I'd be happy to explain..."

**Uncertainty:**
- "I'm not entirely sure about that, but..."
- "That's outside my expertise, but I can try to help..."
- "Let me think about the best way to answer that..."

**Closing:**
- "Is there anything else I can help you with?"
- "Let me know if you need anything else!"
- "Feel free to ask if you have more questions."

## Usage Notes

- Keep responses under 2-3 sentences when possible
- Use contractions (I'm, you're, let's) for natural speech
- Avoid jargon unless the user uses it first
- Match the user's energy level (casual vs. professional)
- Be empathetic to the user's needs and emotions

68 changes: 68 additions & 0 deletions plugins/elevenlabs/example/elevenlabs_example.py
@@ -0,0 +1,68 @@
"""
ElevenLabs TTS and STT Example

This example demonstrates ElevenLabs TTS and Scribe v2 STT integration with Vision Agents.

This example creates an agent that uses:
- ElevenLabs for text-to-speech (TTS)
- ElevenLabs Scribe v2 for speech-to-text (STT)
- GetStream for edge/real-time communication
- Smart Turn for turn detection

Requirements:
- ELEVENLABS_API_KEY environment variable
- STREAM_API_KEY and STREAM_API_SECRET environment variables
"""

import asyncio
import logging

from dotenv import load_dotenv

from vision_agents.core import User, Agent, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import elevenlabs, getstream, smart_turn, gemini


logger = logging.getLogger(__name__)

load_dotenv()


async def create_agent(**kwargs) -> Agent:
    """Create the agent with ElevenLabs TTS and STT."""
    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Friendly AI", id="agent"),
        instructions=(
            "You're a friendly voice AI assistant. Keep your replies "
            "conversational and concise. Read @assistant.md for personality guidelines."
        ),
        tts=elevenlabs.TTS(),  # Uses ElevenLabs for text-to-speech
        stt=elevenlabs.STT(),  # Uses ElevenLabs Scribe v2 for speech-to-text
        llm=gemini.LLM("gemini-2.5-flash-lite"),
        turn_detection=smart_turn.TurnDetection(),
    )
    return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    """Join the call and start the agent."""
    # Ensure the agent user is created
    await agent.create_user()
    # Create a call
    call = await agent.create_call(call_type, call_id)

    logger.info("🤖 Starting ElevenLabs Agent...")

    # Have the agent join the call/room
    with await agent.join(call):
        logger.info("Joining call")
        logger.info("LLM ready")

        await asyncio.sleep(5)
        await agent.llm.simple_response(text="Hello! How can I help you today?")

        await agent.finish()  # Run until the call ends


if __name__ == "__main__":
    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))

21 changes: 21 additions & 0 deletions plugins/elevenlabs/example/pyproject.toml
@@ -0,0 +1,21 @@
[project]
name = "elevenlabs-example"
version = "0.0.0"
requires-python = ">=3.10"

dependencies = [
    "python-dotenv>=1.0",
    "vision-agents-plugins-elevenlabs",
    "vision-agents-plugins-getstream",
    "vision-agents-plugins-smart-turn",
    "vision-agents-plugins-gemini",
    "vision-agents",
]

[tool.uv.sources]
"vision-agents-plugins-elevenlabs" = {path = "..", editable=true}
"vision-agents-plugins-getstream" = {path = "../../getstream", editable=true}
"vision-agents-plugins-smart-turn" = {path = "../../smart_turn", editable=true}
"vision-agents-plugins-gemini" = {path = "../../gemini", editable=true}
"vision-agents" = {path = "../../../agents-core", editable=true}

6 changes: 3 additions & 3 deletions plugins/elevenlabs/pyproject.toml
@@ -5,14 +5,14 @@ build-backend = "hatchling.build"
[project]
name = "vision-agents-plugins-elevenlabs"
dynamic = ["version"]
description = "ElevenLabs TTS integration for Vision Agents"
description = "ElevenLabs TTS and STT integration for Vision Agents"
readme = "README.md"
keywords = ["elevenlabs", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
keywords = ["elevenlabs", "TTS", "text-to-speech", "STT", "speech-to-text", "AI", "voice agents", "agents"]
requires-python = ">=3.10"
license = "MIT"
dependencies = [
"vision-agents",
"elevenlabs>=2.5.0",
"elevenlabs>=2.22.1",
]

[project.urls]