Commit 12b1638
Fix agent LLM triggering and turn detection
- Implement automatic LLM triggering in _on_transcript() for both modes:
  * Without turn detection: triggers immediately on transcript completion
  * With turn detection: accumulates transcripts and waits for TurnEndedEvent
- Add _pending_user_transcripts dict to track multi-chunk transcripts per user
- Implement turn detection LLM response in _on_turn_event()
- Add TTS interruption when user starts speaking (barge-in)
- Fix FAL turn detection event emission logic
- Fix double TTS triggering in OpenAI LLM plugin (was emitting LLMResponseCompletedEvent twice)
- Add FAL turn detection to simple agent example
- Update example dependencies to use vision-agents naming

Known limitation: LLM response generation is not yet cancelled when the user interrupts. Only TTS audio playback stops; the LLM continues generating in the background.
1 parent 589f6da commit 12b1638
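
In outline, the new dispatch in _on_transcript() works as sketched below (a simplified illustration of the logic in this commit, not the verbatim diff; how user_id is derived from the event is elided here):

    # Sketch: finalized transcripts either buffer per user (turn detection on)
    # or trigger a response immediately (turn detection off).
    async def _on_transcript(self, event):
        if self.turn_detection is not None:
            # A single turn may arrive as several transcript chunks.
            prev = self._pending_user_transcripts.get(user_id, "")
            self._pending_user_transcripts[user_id] = (prev + " " + event.text).strip()
            # The response fires later, from _on_turn_event() on TurnEndedEvent.
        elif not self.realtime_mode and self.llm:
            # No turn detector: respond as soon as STT finalizes the utterance.
            await self.simple_response(event.text, getattr(event, "user_metadata", None))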

File tree: 6 files changed, +1867 -1852 lines

agents-core/vision_agents/core/agents/agents.py

Lines changed: 71 additions & 5 deletions

@@ -129,6 +129,9 @@ def __init__(
         self.conversation: Optional[Conversation] = None
         self._user_conversation_handle: Optional[StreamHandle] = None
         self._agent_conversation_handle: Optional[StreamHandle] = None
+
+        # Track pending transcripts for turn-based response triggering
+        self._pending_user_transcripts: Dict[str, str] = {}

         # Merge plugin events BEFORE subscribing to any events
         for plugin in [stt, tts, turn_detection, vad, llm]:
@@ -664,15 +667,51 @@ async def _process_track(self, track_id: str, track_type: str, participant):
     async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None:
         """Handle turn detection events."""
         if isinstance(event, TurnStartedEvent):
-            # TODO: Implement TTS pause/resume functionality
-            # For now, TTS will continue playing - this should be improved
-            self.logger.info(
-                f"👉 Turn started - participant speaking {event.speaker_id} : {event.confidence}"
-            )
+            # Interrupt TTS when user starts speaking (barge-in)
+            if event.speaker_id and event.speaker_id != self.agent_user.id:
+                if self.tts:
+                    self.logger.info(
+                        f"👉 Turn started - interrupting TTS for participant {event.speaker_id}"
+                    )
+                    try:
+                        await self.tts.stop_audio()
+                    except Exception as e:
+                        self.logger.error(f"Error stopping TTS: {e}")
+                else:
+                    self.logger.info(
+                        f"👉 Turn started - participant speaking {event.speaker_id} : {event.confidence}"
+                    )
+            else:
+                # Agent itself started speaking - this is normal
+                self.logger.debug(
+                    f"👉 Turn started - agent speaking {event.speaker_id}"
+                )
         elif isinstance(event, TurnEndedEvent):
             self.logger.info(
                 f"👉 Turn ended - participant {event.speaker_id} finished (duration: {event.duration}, confidence: {event.confidence})"
             )
+
+            # When turn detection is enabled, trigger LLM response when user's turn ends
+            # This is the signal that the user has finished speaking and expects a response
+            if event.speaker_id and event.speaker_id != self.agent_user.id:
+                # Get the accumulated transcript for this speaker
+                transcript = self._pending_user_transcripts.get(event.speaker_id, "")
+
+                if transcript and transcript.strip():
+                    self.logger.info(f"🤖 Triggering LLM response after turn ended for {event.speaker_id}")
+
+                    # Create participant object if we have metadata
+                    participant = None
+                    if hasattr(event, 'custom') and event.custom:
+                        # Try to extract participant info from custom metadata
+                        participant = event.custom.get('participant')
+
+                    # Trigger LLM response with the complete transcript
+                    if not self.realtime_mode and self.llm:
+                        await self.simple_response(transcript, participant)
+
+                # Clear the pending transcript for this speaker
+                self._pending_user_transcripts[event.speaker_id] = ""

     async def _on_partial_transcript(
         self, event: STTPartialTranscriptEvent | RealtimePartialTranscriptEvent
@@ -727,6 +766,33 @@ async def _on_transcript(self, event: STTTranscriptEvent | RealtimeTranscriptEvent
             )
             self.conversation.complete_message(self._user_conversation_handle)
             self._user_conversation_handle = None
+
+        # Determine how to handle LLM triggering based on turn detection
+        if self.turn_detection is not None:
+            # With turn detection: accumulate transcripts and wait for TurnEndedEvent
+            # Store/append the transcript for this user
+            if user_id not in self._pending_user_transcripts:
+                self._pending_user_transcripts[user_id] = event.text
+            else:
+                # Append to existing transcript (user might be speaking in chunks)
+                self._pending_user_transcripts[user_id] += " " + event.text
+
+            self.logger.debug(
+                f"📝 Accumulated transcript for {user_id} (waiting for turn end): "
+                f"{self._pending_user_transcripts[user_id][:100]}..."
+            )
+        else:
+            # Without turn detection: trigger LLM immediately on transcript completion
+            # This is the traditional STT -> LLM flow
+            if not self.realtime_mode and self.llm:
+                self.logger.info(f"🤖 Triggering LLM response immediately (no turn detection)")
+
+                # Get participant from event metadata
+                participant = None
+                if hasattr(event, "user_metadata"):
+                    participant = event.user_metadata
+
+                await self.simple_response(event.text, participant)

     async def _on_stt_error(self, error):
         """Handle STT service errors."""

agents-core/vision_agents/core/turn_detection/fal_turn_detection.py

Lines changed: 7 additions & 4 deletions

@@ -311,10 +311,13 @@ async def _process_turn_prediction(
                 f"Turn completed detected for user {user_id} (confidence: {probability:.3f})"
             )

-            # If this user was speaking, emit turn ended
-            if self._current_speaker == user_id:
-                self._emit_turn_event(TurnEvent.TURN_ENDED, event_data)
-                self._current_speaker = None
+            # User finished speaking - emit turn ended
+            # Set them as current speaker if they weren't already (in case we missed the start)
+            if self._current_speaker != user_id:
+                self._current_speaker = user_id
+
+            self._emit_turn_event(TurnEvent.TURN_ENDED, event_data)
+            self._current_speaker = None

         else:
             # Turn is still in progress
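
The effect of this change: previously a turn-completed prediction was dropped whenever the detector had missed the corresponding turn start (self._current_speaker never matched), so the agent never got the signal to respond. The fix adopts the speaker first, then always emits. An isolated sketch of the new state handling (plain Python, no project imports):

    def on_turn_completed(state, user_id, emit):
        # Adopt the speaker if we missed the TURN_STARTED prediction...
        if state.get("current_speaker") != user_id:
            state["current_speaker"] = user_id
        # ...so TURN_ENDED is always emitted, then the slot is cleared.
        emit("TURN_ENDED", user_id)
        state["current_speaker"] = None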

examples/01_simple_agent_example/pyproject.toml

Lines changed: 11 additions & 20 deletions

@@ -6,31 +6,22 @@ requires-python = ">=3.13"
 # put only what this example needs
 dependencies = [
     "python-dotenv>=1.0",
-    "stream-agents-plugins-deepgram",
-    "stream-agents-plugins-elevenlabs",
-    "stream-agents-plugins-anthropic",
-    "stream-agents-plugins-getstream",
+    "vision-agents-plugins-deepgram",
+    "vision-agents-plugins-elevenlabs",
+    "vision-agents-plugins-anthropic",
+    "vision-agents-plugins-getstream",
     "getstream-plugins-common",
-    "stream-agents",
+    "vision-agents",
     "openai>=1.101.0",
-    "krisp-audio>=1.4.0; sys_platform == 'darwin' and platform_machine == 'aarch64'",
-    "krisp-audio>=1.4.0; sys_platform == 'win32'",
-    "krisp-audio>=1.4.0; sys_platform == 'linux' and platform_machine == 'x86_64'",
-    "krisp-audio>=1.4.0; sys_platform == 'linux' and platform_machine == 'aarch64'",
     "anthropic>=0.66.0",
     "google-genai>=1.33.0",
+    "fal-client>=0.5.3",
 ]

 [tool.uv.sources]
-krisp-audio = [
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-macosx_12_0_arm64.whl", marker = "sys_platform == 'darwin' and platform_machine == 'aarch64'" },
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64'" },
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64'" },
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-win_amd64.whl", marker = "sys_platform == 'win32'" }
-]
-"stream-agents-plugins-deepgram" = {path = "../../plugins/deepgram", editable=true}
-"stream-agents-plugins-elevenlabs" = {path = "../../plugins/elevenlabs", editable=true}
-"stream-agents-plugins-anthropic" = {path = "../../plugins/anthropic", editable=true}
-"stream-agents-plugins-getstream" = {path = "../../plugins/getstream", editable=true}
+"vision-agents-plugins-deepgram" = {path = "../../plugins/deepgram", editable=true}
+"vision-agents-plugins-elevenlabs" = {path = "../../plugins/elevenlabs", editable=true}
+"vision-agents-plugins-anthropic" = {path = "../../plugins/anthropic", editable=true}
+"vision-agents-plugins-getstream" = {path = "../../plugins/getstream", editable=true}

-"stream-agents" = {path = "../../agents-core", editable=true}
+"vision-agents" = {path = "../../agents-core", editable=true}

examples/01_simple_agent_example/simple_agent_example.py

Lines changed: 2 additions & 0 deletions

@@ -8,6 +8,7 @@
 from vision_agents.plugins import elevenlabs, deepgram, openai, getstream
 from vision_agents.core import agents, cli
 from vision_agents.core.events import CallSessionParticipantJoinedEvent
+from vision_agents.core.turn_detection import FalTurnDetection

 logging.basicConfig(
     level=logging.INFO,
@@ -37,6 +38,7 @@ async def start_agent() -> None:
         llm=llm,
         tts=elevenlabs.TTS(),
         stt=deepgram.STT(),
+        turn_detection=FalTurnDetection(buffer_duration=2.0, confidence_threshold=0.5),  # Enable turn detection with FAL
         #vad=silero.VAD(),
         # realtime version (vad, tts and stt not needed)
         # llm=openai.Realtime()
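
As wired above, the example now pairs Deepgram STT and ElevenLabs TTS with FAL turn detection, so the agent replies on TurnEndedEvent rather than after every finalized transcript. A condensed view of the construction (the enclosing constructor name and the meaning of the two FalTurnDetection parameters are inferred from this diff, not confirmed by it):

    agent = agents.Agent(
        llm=llm,
        tts=elevenlabs.TTS(),
        stt=deepgram.STT(),
        turn_detection=FalTurnDetection(
            buffer_duration=2.0,       # presumably seconds of audio buffered per prediction
            confidence_threshold=0.5,  # presumably min probability to treat the turn as over
        ),
    )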
