Skip to content

Commit 1550422

Browse files
committed
Implement async parallel detection for Moondream, Roboflow, and YOLO processors
- Add detection_fps parameter to control inference rate independently of video FPS
- Remove fps parameter from processors (video now runs at native rate)
- Implement parallel detection with timestamp-based result ordering
- Reduce queue sizes to minimize latency (max_buffer=5, max_queue_size=5)
- Remove redundant MoondreamVideoTrack and YOLOPoseVideoTrack classes
- Reduce ThreadPoolExecutor max_workers from 10/24 to 2
- Fix indentation in roboflow_cloud_processor.py
1 parent 1e7b65f commit 1550422

File tree

10 files changed

+224
-299
lines changed

10 files changed

+224
-299
lines changed

plugins/moondream/tests/test_moondream.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121

2222
from vision_agents.plugins.moondream import (
2323
CloudDetectionProcessor,
24-
MoondreamVideoTrack,
2524
)
25+
from vision_agents.core.utils.video_track import QueuedVideoTrack
2626
from vision_agents.plugins.moondream.moondream_utils import annotate_detections
2727

2828

@@ -48,7 +48,7 @@ def test_processor_initialization():
4848
@pytest.mark.asyncio
4949
async def test_video_track_frame_queuing(sample_frame):
5050
"""Test that video track can queue and receive frames."""
51-
track = MoondreamVideoTrack()
51+
track = QueuedVideoTrack(fps=30, max_queue_size=5)
5252
await track.add_frame(sample_frame)
5353
received_frame = await track.recv()
5454
assert received_frame is not None
@@ -58,10 +58,10 @@ async def test_video_track_frame_queuing(sample_frame):
5858

5959

6060
def test_processor_publishes_track():
61-
"""Test that processor publishes a MoondreamVideoTrack."""
61+
"""Test that processor publishes a QueuedVideoTrack."""
6262
processor = CloudDetectionProcessor(api_key="test_key")
6363
track = processor.publish_video_track()
64-
assert isinstance(track, MoondreamVideoTrack)
64+
assert isinstance(track, QueuedVideoTrack)
6565
processor.close()
6666

6767

@@ -231,9 +231,9 @@ async def mock_inference(frame_array):
231231
# Process a frame
232232
await processor._process_and_add_frame(sample_frame)
233233

234-
# Verify results were stored
235-
assert hasattr(processor, "_last_results")
236-
assert "detections" in processor._last_results
234+
# Verify cached results were stored
235+
assert hasattr(processor, "_cached_results")
236+
assert "detections" in processor._cached_results
237237
processor.close()
238238

239239

plugins/moondream/vision_agents/plugins/moondream/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,6 @@
1111
from vision_agents.plugins.moondream.detection.moondream_local_processor import (
1212
LocalDetectionProcessor,
1313
)
14-
from vision_agents.plugins.moondream.detection.moondream_video_track import (
15-
MoondreamVideoTrack,
16-
)
1714
from vision_agents.plugins.moondream.vlm.moondream_cloud_vlm import CloudVLM
1815
from vision_agents.plugins.moondream.vlm.moondream_local_vlm import LocalVLM
1916

@@ -25,5 +22,4 @@
2522
"CloudVLM",
2623
"LocalVLM",
2724
"LocalDetectionProcessor",
28-
"MoondreamVideoTrack",
2925
]

plugins/moondream/vision_agents/plugins/moondream/detection/moondream_cloud_processor.py

Lines changed: 28 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,8 @@
1919
annotate_detections,
2020
parse_detection_bbox,
2121
)
22-
from vision_agents.plugins.moondream.detection.moondream_video_track import (
23-
MoondreamVideoTrack,
24-
)
2522
from vision_agents.core.utils.video_forwarder import VideoForwarder
23+
from vision_agents.core.utils.video_track import QueuedVideoTrack
2624
import moondream as md
2725

2826

@@ -54,7 +52,7 @@ class CloudDetectionProcessor(
5452
detection_fps: Rate at which to send frames for detection (default: 5.0).
5553
Lower values reduce API calls while maintaining smooth video.
5654
interval: Processing interval in seconds (default: 0)
57-
max_workers: Number of worker threads for CPU-intensive operations (default: 10)
55+
max_workers: Number of worker threads for CPU-intensive operations (default: 2)
5856
"""
5957

6058
name = "moondream_cloud"
@@ -66,7 +64,7 @@ def __init__(
6664
detect_objects: Union[str, List[str]] = "person",
6765
detection_fps: float = 5.0,
6866
interval: int = 0,
69-
max_workers: int = 10,
67+
max_workers: int = 2,
7068
):
7169
super().__init__(interval=interval, receive_audio=False, receive_video=True)
7270

@@ -76,14 +74,9 @@ def __init__(
7674
self.max_workers = max_workers
7775
self._shutdown = False
7876

79-
# Initialize state tracking attributes
80-
self._last_results: Dict[str, Any] = {}
81-
self._last_frame_time: Optional[float] = None
82-
self._last_frame_pil: Optional[Image.Image] = None
83-
84-
# Async detection state
85-
self._detection_in_progress = False
77+
# Parallel detection state - track when results were requested to handle out-of-order completion
8678
self._last_detection_time: float = 0.0
79+
self._last_result_time: float = 0.0
8780
self._cached_results: Dict[str, Any] = {"detections": []}
8881

8982
# Font configuration constants for drawing efficiency
@@ -110,8 +103,8 @@ def __init__(
110103
max_workers=max_workers, thread_name_prefix="moondream_processor"
111104
)
112105

113-
# Video track for publishing (if used as video publisher)
114-
self._video_track: MoondreamVideoTrack = MoondreamVideoTrack()
106+
# Video track for publishing at 30 FPS with minimal buffering
107+
self._video_track: QueuedVideoTrack = QueuedVideoTrack(fps=30, max_queue_size=5)
115108
self._video_forwarder: Optional[VideoForwarder] = None
116109

117110
# Initialize model
@@ -145,10 +138,10 @@ async def process_video(
145138
self._process_and_add_frame, name="moondream"
146139
)
147140
else:
148-
# Create our own VideoForwarder at default FPS
141+
# Create our own VideoForwarder at default FPS with minimal buffering
149142
self._video_forwarder = VideoForwarder(
150143
incoming_track, # type: ignore[arg-type]
151-
max_buffer=30,
144+
max_buffer=5,
152145
name="moondream_forwarder",
153146
)
154147

@@ -225,24 +218,18 @@ async def _process_and_add_frame(self, frame: av.VideoFrame):
225218
frame_array = frame.to_ndarray(format="rgb24")
226219
now = asyncio.get_event_loop().time()
227220

228-
# Check if we should start a new detection
221+
# Check if we should start a new detection based on detection_fps
229222
detection_interval = (
230223
1.0 / self.detection_fps if self.detection_fps > 0 else float("inf")
231224
)
232-
should_detect = (
233-
not self._detection_in_progress
234-
and (now - self._last_detection_time) >= detection_interval
235-
)
225+
should_detect = (now - self._last_detection_time) >= detection_interval
236226

237227
if should_detect:
238-
# Start detection in background (don't await)
239-
self._detection_in_progress = True
228+
# Start detection in background (don't await) - runs in parallel
240229
self._last_detection_time = now
241-
asyncio.create_task(self._run_detection_background(frame_array.copy()))
242-
243-
# Always use cached results for annotation (don't wait for detection)
244-
self._last_frame_time = now
245-
self._last_frame_pil = Image.fromarray(frame_array)
230+
asyncio.create_task(
231+
self._run_detection_background(frame_array.copy(), now)
232+
)
246233

247234
# Annotate frame with cached detections
248235
if self._cached_results.get("detections"):
@@ -265,19 +252,23 @@ async def _process_and_add_frame(self, frame: av.VideoFrame):
265252
# Pass through original frame on error
266253
await self._video_track.add_frame(frame)
267254

268-
async def _run_detection_background(self, frame_array: np.ndarray):
269-
"""Run detection in background and update cached results."""
255+
async def _run_detection_background(
256+
self, frame_array: np.ndarray, request_time: float
257+
):
258+
"""Run detection in background and update cached results if newer."""
270259
try:
271260
results = await self._run_inference(frame_array)
272-
self._cached_results = results
273-
self._last_results = results
274-
logger.debug(
275-
f"🔍 Detection complete: {len(results.get('detections', []))} objects"
276-
)
261+
# Only update cache if this result is newer than current cached result
262+
if request_time > self._last_result_time:
263+
self._cached_results = results
264+
self._last_result_time = request_time
265+
logger.debug(
266+
f"🔍 Detection complete: {len(results.get('detections', []))} objects"
267+
)
268+
else:
269+
logger.debug("🔍 Detection complete but discarded (newer result exists)")
277270
except Exception as e:
278271
logger.warning(f"⚠️ Background detection failed: {e}")
279-
finally:
280-
self._detection_in_progress = False
281272

282273
def close(self):
283274
"""Clean up resources."""

plugins/moondream/vision_agents/plugins/moondream/detection/moondream_local_processor.py

Lines changed: 28 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@
2424
annotate_detections,
2525
handle_device,
2626
)
27-
from vision_agents.plugins.moondream.detection.moondream_video_track import (
28-
MoondreamVideoTrack,
29-
)
27+
from vision_agents.core.utils.video_track import QueuedVideoTrack
3028

3129
logger = logging.getLogger(__name__)
3230

@@ -56,7 +54,7 @@ class LocalDetectionProcessor(
5654
detection_fps: Rate at which to run detection (default: 10.0).
5755
Lower values reduce CPU/GPU load while maintaining smooth video.
5856
interval: Processing interval in seconds (default: 0)
59-
max_workers: Number of worker threads for CPU-intensive operations (default: 10)
57+
max_workers: Number of worker threads for CPU-intensive operations (default: 2)
6058
force_cpu: If True, force CPU usage even if CUDA/MPS is available (default: False).
6159
Auto-detects CUDA, then MPS (Apple Silicon), then defaults to CPU. We recommend running on CUDA for best performance.
6260
model_name: Hugging Face model identifier (default: "moondream/moondream3-preview")
@@ -72,7 +70,7 @@ def __init__(
7270
detect_objects: Union[str, List[str]] = "person",
7371
detection_fps: float = 10.0,
7472
interval: int = 0,
75-
max_workers: int = 10,
73+
max_workers: int = 2,
7674
force_cpu: bool = False,
7775
model_name: str = "moondream/moondream3-preview",
7876
options: Optional[AgentOptions] = None,
@@ -94,13 +92,9 @@ def __init__(
9492
else:
9593
self._device, self._dtype = handle_device()
9694

97-
self._last_results: Dict[str, Any] = {}
98-
self._last_frame_time: Optional[float] = None
99-
self._last_frame_pil: Optional[Image.Image] = None
100-
101-
# Async detection state
102-
self._detection_in_progress = False
95+
# Parallel detection state - track when results were requested to handle out-of-order completion
10396
self._last_detection_time: float = 0.0
97+
self._last_result_time: float = 0.0
10498
self._cached_results: Dict[str, Any] = {"detections": []}
10599

106100
# Font configuration constants for drawing efficiency
@@ -122,8 +116,8 @@ def __init__(
122116
max_workers=max_workers, thread_name_prefix="moondream_local_processor"
123117
)
124118

125-
# Video track for publishing (if used as video publisher)
126-
self._video_track: MoondreamVideoTrack = MoondreamVideoTrack()
119+
# Video track for publishing at 30 FPS with minimal buffering
120+
self._video_track: QueuedVideoTrack = QueuedVideoTrack(fps=30, max_queue_size=5)
127121
self._video_forwarder: Optional[VideoForwarder] = None
128122

129123
# Model will be loaded in start() method
@@ -245,10 +239,10 @@ async def process_video(
245239
self._process_and_add_frame, name="moondream_local"
246240
)
247241
else:
248-
# Create our own VideoForwarder at default FPS
242+
# Create our own VideoForwarder at default FPS with minimal buffering
249243
self._video_forwarder = VideoForwarder(
250244
incoming_track, # type: ignore[arg-type]
251-
max_buffer=30,
245+
max_buffer=5,
252246
name="moondream_local_forwarder",
253247
)
254248

@@ -317,24 +311,18 @@ async def _process_and_add_frame(self, frame: av.VideoFrame):
317311
frame_array = frame.to_ndarray(format="rgb24")
318312
now = asyncio.get_event_loop().time()
319313

320-
# Check if we should start a new detection
314+
# Check if we should start a new detection based on detection_fps
321315
detection_interval = (
322316
1.0 / self.detection_fps if self.detection_fps > 0 else float("inf")
323317
)
324-
should_detect = (
325-
not self._detection_in_progress
326-
and (now - self._last_detection_time) >= detection_interval
327-
)
318+
should_detect = (now - self._last_detection_time) >= detection_interval
328319

329320
if should_detect:
330-
# Start detection in background (don't await)
331-
self._detection_in_progress = True
321+
# Start detection in background (don't await) - runs in parallel
332322
self._last_detection_time = now
333-
asyncio.create_task(self._run_detection_background(frame_array.copy()))
334-
335-
# Always use cached results for annotation (don't wait for detection)
336-
self._last_frame_time = now
337-
self._last_frame_pil = Image.fromarray(frame_array)
323+
asyncio.create_task(
324+
self._run_detection_background(frame_array.copy(), now)
325+
)
338326

339327
# Annotate frame with cached detections
340328
if self._cached_results.get("detections"):
@@ -356,19 +344,23 @@ async def _process_and_add_frame(self, frame: av.VideoFrame):
356344
logger.exception(f"❌ Frame processing failed: {e}")
357345
await self._video_track.add_frame(frame)
358346

359-
async def _run_detection_background(self, frame_array: np.ndarray):
360-
"""Run detection in background and update cached results."""
347+
async def _run_detection_background(
348+
self, frame_array: np.ndarray, request_time: float
349+
):
350+
"""Run detection in background and update cached results if newer."""
361351
try:
362352
results = await self._run_inference(frame_array)
363-
self._cached_results = results
364-
self._last_results = results
365-
logger.debug(
366-
f"🔍 Detection complete: {len(results.get('detections', []))} objects"
367-
)
353+
# Only update cache if this result is newer than current cached result
354+
if request_time > self._last_result_time:
355+
self._cached_results = results
356+
self._last_result_time = request_time
357+
logger.debug(
358+
f"🔍 Detection complete: {len(results.get('detections', []))} objects"
359+
)
360+
else:
361+
logger.debug("🔍 Detection complete but discarded (newer result exists)")
368362
except Exception as e:
369363
logger.warning(f"⚠️ Background detection failed: {e}")
370-
finally:
371-
self._detection_in_progress = False
372364

373365
def close(self):
374366
"""Clean up resources."""

0 commit comments

Comments (0)