Commit 12b1638
Fix agent LLM triggering and turn detection
- Implement automatic LLM triggering in _on_transcript() for both modes:
  * Without turn detection: triggers immediately on transcript completion
  * With turn detection: accumulates transcripts and waits for TurnEndedEvent
- Add _pending_user_transcripts dict to track multi-chunk transcripts per user
- Implement turn detection LLM response in _on_turn_event()
- Add TTS interruption when user starts speaking (barge-in)
- Fix FAL turn detection event emission logic
- Fix double TTS triggering in OpenAI LLM plugin (was emitting LLMResponseCompletedEvent twice)
- Add FAL turn detection to simple agent example
- Update example dependencies to use vision-agents naming

Known limitation: LLM response generation is not yet cancelled when the user interrupts. Only TTS audio playback stops; the LLM continues generating in the background.
1 parent 589f6da commit 12b1638
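
In outline, the new dispatch in _on_transcript() works as sketched below (a simplified illustration of the logic in this commit, not the verbatim diff; how user_id is derived from the event is elided here):

    # Sketch: finalized transcripts either buffer per user (turn detection on)
    # or trigger a response immediately (turn detection off).
    async def _on_transcript(self, event):
        if self.turn_detection is not None:
            # A single turn may arrive as several transcript chunks.
            prev = self._pending_user_transcripts.get(user_id, "")
            self._pending_user_transcripts[user_id] = (prev + " " + event.text).strip()
            # The response fires later, from _on_turn_event() on TurnEndedEvent.
        elif not self.realtime_mode and self.llm:
            # No turn detector: respond as soon as STT finalizes the utterance.
            await self.simple_response(event.text, getattr(event, "user_metadata", None))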

File tree: 6 files changed, +1867 -1852 lines

agents-core/vision_agents/core/agents/agents.py

Lines changed: 71 additions & 5 deletions

@@ -129,6 +129,9 @@ def __init__(
         self.conversation: Optional[Conversation] = None
         self._user_conversation_handle: Optional[StreamHandle] = None
         self._agent_conversation_handle: Optional[StreamHandle] = None
+
+        # Track pending transcripts for turn-based response triggering
+        self._pending_user_transcripts: Dict[str, str] = {}

         # Merge plugin events BEFORE subscribing to any events
         for plugin in [stt, tts, turn_detection, vad, llm]:
@@ -664,15 +667,51 @@ async def _process_track(self, track_id: str, track_type: str, participant):
     async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None:
         """Handle turn detection events."""
         if isinstance(event, TurnStartedEvent):
-            # TODO: Implement TTS pause/resume functionality
-            # For now, TTS will continue playing - this should be improved
-            self.logger.info(
-                f"👉 Turn started - participant speaking {event.speaker_id} : {event.confidence}"
-            )
+            # Interrupt TTS when user starts speaking (barge-in)
+            if event.speaker_id and event.speaker_id != self.agent_user.id:
+                if self.tts:
+                    self.logger.info(
+                        f"👉 Turn started - interrupting TTS for participant {event.speaker_id}"
+                    )
+                    try:
+                        await self.tts.stop_audio()
+                    except Exception as e:
+                        self.logger.error(f"Error stopping TTS: {e}")
+                else:
+                    self.logger.info(
+                        f"👉 Turn started - participant speaking {event.speaker_id} : {event.confidence}"
+                    )
+            else:
+                # Agent itself started speaking - this is normal
+                self.logger.debug(
+                    f"👉 Turn started - agent speaking {event.speaker_id}"
+                )
         elif isinstance(event, TurnEndedEvent):
             self.logger.info(
                 f"👉 Turn ended - participant {event.speaker_id} finished (duration: {event.duration}, confidence: {event.confidence})"
             )
+
+            # When turn detection is enabled, trigger LLM response when user's turn ends
+            # This is the signal that the user has finished speaking and expects a response
+            if event.speaker_id and event.speaker_id != self.agent_user.id:
+                # Get the accumulated transcript for this speaker
+                transcript = self._pending_user_transcripts.get(event.speaker_id, "")
+
+                if transcript and transcript.strip():
+                    self.logger.info(f"🤖 Triggering LLM response after turn ended for {event.speaker_id}")
+
+                    # Create participant object if we have metadata
+                    participant = None
+                    if hasattr(event, 'custom') and event.custom:
+                        # Try to extract participant info from custom metadata
+                        participant = event.custom.get('participant')
+
+                    # Trigger LLM response with the complete transcript
+                    if not self.realtime_mode and self.llm:
+                        await self.simple_response(transcript, participant)
+
+                # Clear the pending transcript for this speaker
+                self._pending_user_transcripts[event.speaker_id] = ""

     async def _on_partial_transcript(
         self, event: STTPartialTranscriptEvent | RealtimePartialTranscriptEvent
@@ -727,6 +766,33 @@ async def _on_transcript(self, event: STTTranscriptEvent | RealtimeTranscriptEvent
             )
             self.conversation.complete_message(self._user_conversation_handle)
             self._user_conversation_handle = None
+
+        # Determine how to handle LLM triggering based on turn detection
+        if self.turn_detection is not None:
+            # With turn detection: accumulate transcripts and wait for TurnEndedEvent
+            # Store/append the transcript for this user
+            if user_id not in self._pending_user_transcripts:
+                self._pending_user_transcripts[user_id] = event.text
+            else:
+                # Append to existing transcript (user might be speaking in chunks)
+                self._pending_user_transcripts[user_id] += " " + event.text
+
+            self.logger.debug(
+                f"📝 Accumulated transcript for {user_id} (waiting for turn end): "
+                f"{self._pending_user_transcripts[user_id][:100]}..."
+            )
+        else:
+            # Without turn detection: trigger LLM immediately on transcript completion
+            # This is the traditional STT -> LLM flow
+            if not self.realtime_mode and self.llm:
+                self.logger.info(f"🤖 Triggering LLM response immediately (no turn detection)")
+
+                # Get participant from event metadata
+                participant = None
+                if hasattr(event, "user_metadata"):
+                    participant = event.user_metadata
+
+                await self.simple_response(event.text, participant)

     async def _on_stt_error(self, error):
         """Handle STT service errors."""

agents-core/vision_agents/core/turn_detection/fal_turn_detection.py

Lines changed: 7 additions & 4 deletions

@@ -311,10 +311,13 @@ async def _process_turn_prediction(
                 f"Turn completed detected for user {user_id} (confidence: {probability:.3f})"
             )

-            # If this user was speaking, emit turn ended
-            if self._current_speaker == user_id:
-                self._emit_turn_event(TurnEvent.TURN_ENDED, event_data)
-                self._current_speaker = None
+            # User finished speaking - emit turn ended
+            # Set them as current speaker if they weren't already (in case we missed the start)
+            if self._current_speaker != user_id:
+                self._current_speaker = user_id
+
+            self._emit_turn_event(TurnEvent.TURN_ENDED, event_data)
+            self._current_speaker = None

         else:
             # Turn is still in progress
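
The effect of this change: previously a turn-completed prediction was dropped whenever the detector had missed the corresponding turn start (self._current_speaker never matched), so the agent never got the signal to respond. The fix adopts the speaker first, then always emits. An isolated sketch of the new state handling (plain Python, no project imports):

    def on_turn_completed(state, user_id, emit):
        # Adopt the speaker if we missed the TURN_STARTED prediction...
        if state.get("current_speaker") != user_id:
            state["current_speaker"] = user_id
        # ...so TURN_ENDED is always emitted, then the slot is cleared.
        emit("TURN_ENDED", user_id)
        state["current_speaker"] = None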

examples/01_simple_agent_example/pyproject.toml

Lines changed: 11 additions & 20 deletions

@@ -6,31 +6,22 @@ requires-python = ">=3.13"
 # put only what this example needs
 dependencies = [
     "python-dotenv>=1.0",
-    "stream-agents-plugins-deepgram",
-    "stream-agents-plugins-elevenlabs",
-    "stream-agents-plugins-anthropic",
-    "stream-agents-plugins-getstream",
+    "vision-agents-plugins-deepgram",
+    "vision-agents-plugins-elevenlabs",
+    "vision-agents-plugins-anthropic",
+    "vision-agents-plugins-getstream",
     "getstream-plugins-common",
-    "stream-agents",
+    "vision-agents",
     "openai>=1.101.0",
-    "krisp-audio>=1.4.0; sys_platform == 'darwin' and platform_machine == 'aarch64'",
-    "krisp-audio>=1.4.0; sys_platform == 'win32'",
-    "krisp-audio>=1.4.0; sys_platform == 'linux' and platform_machine == 'x86_64'",
-    "krisp-audio>=1.4.0; sys_platform == 'linux' and platform_machine == 'aarch64'",
     "anthropic>=0.66.0",
     "google-genai>=1.33.0",
+    "fal-client>=0.5.3",
 ]

 [tool.uv.sources]
-krisp-audio = [
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-macosx_12_0_arm64.whl", marker = "sys_platform == 'darwin' and platform_machine == 'aarch64'" },
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64'" },
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64'" },
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-win_amd64.whl", marker = "sys_platform == 'win32'" }
-]
-"stream-agents-plugins-deepgram" = {path = "../../plugins/deepgram", editable=true}
-"stream-agents-plugins-elevenlabs" = {path = "../../plugins/elevenlabs", editable=true}
-"stream-agents-plugins-anthropic" = {path = "../../plugins/anthropic", editable=true}
-"stream-agents-plugins-getstream" = {path = "../../plugins/getstream", editable=true}
+"vision-agents-plugins-deepgram" = {path = "../../plugins/deepgram", editable=true}
+"vision-agents-plugins-elevenlabs" = {path = "../../plugins/elevenlabs", editable=true}
+"vision-agents-plugins-anthropic" = {path = "../../plugins/anthropic", editable=true}
+"vision-agents-plugins-getstream" = {path = "../../plugins/getstream", editable=true}

-"stream-agents" = {path = "../../agents-core", editable=true}
+"vision-agents" = {path = "../../agents-core", editable=true}

examples/01_simple_agent_example/simple_agent_example.py

Lines changed: 2 additions & 0 deletions

@@ -8,6 +8,7 @@
 from vision_agents.plugins import elevenlabs, deepgram, openai, getstream
 from vision_agents.core import agents, cli
 from vision_agents.core.events import CallSessionParticipantJoinedEvent
+from vision_agents.core.turn_detection import FalTurnDetection

 logging.basicConfig(
     level=logging.INFO,
@@ -37,6 +38,7 @@ async def start_agent() -> None:
         llm=llm,
         tts=elevenlabs.TTS(),
         stt=deepgram.STT(),
+        turn_detection=FalTurnDetection(buffer_duration=2.0, confidence_threshold=0.5),  # Enable turn detection with FAL
         #vad=silero.VAD(),
         # realtime version (vad, tts and stt not needed)
         # llm=openai.Realtime()
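
As wired above, the example now pairs Deepgram STT and ElevenLabs TTS with FAL turn detection, so the agent replies on TurnEndedEvent rather than after every finalized transcript. A condensed view of the construction (the enclosing constructor name and the meaning of the two FalTurnDetection parameters are inferred from this diff, not confirmed by it):

    agent = agents.Agent(
        llm=llm,
        tts=elevenlabs.TTS(),
        stt=deepgram.STT(),
        turn_detection=FalTurnDetection(
            buffer_duration=2.0,       # presumably seconds of audio buffered per prediction
            confidence_threshold=0.5,  # presumably min probability to treat the turn as over
        ),
    )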
