Skip to content

Commit f68dc9c

Browse files
committed
fix: decouple detection from video display for smooth FPS
- Add detection_fps parameter to control detection rate independently - Run detection asynchronously in background tasks - Cache detection results and overlay on frames without waiting - CloudDetectionProcessor defaults to 5.0 detection FPS - LocalDetectionProcessor defaults to 10.0 detection FPS - Video frames now pass through at full FPS regardless of detection speed
1 parent 2bd801f commit f68dc9c

File tree

2 files changed

+91
-12
lines changed

2 files changed

+91
-12
lines changed

plugins/moondream/vision_agents/plugins/moondream/detection/moondream_cloud_processor.py

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ class CloudDetectionProcessor(
4141
which can be increased by contacting the Moondream team. If you are deploying
4242
to your own infrastructure, consider using LocalDetectionProcessor instead.
4343
44+
Detection runs asynchronously in the background while frames pass through at
45+
full FPS. The last known detection results are overlaid on each frame.
46+
4447
Args:
4548
api_key: API key for Moondream Cloud API. If not provided, will attempt to read
4649
from MOONDREAM_API_KEY environment variable.
@@ -49,6 +52,8 @@ class CloudDetectionProcessor(
4952
so any object string works. Examples: "person", "car",
5053
"basketball", ["person", "car", "dog"]. Default: "person"
5154
fps: Frame processing rate (default: 30)
55+
detection_fps: Rate at which to send frames for detection (default: 5.0).
56+
Lower values reduce API calls while maintaining smooth video.
5257
interval: Processing interval in seconds (default: 0)
5358
max_workers: Number of worker threads for CPU-intensive operations (default: 10)
5459
"""
@@ -61,6 +66,7 @@ def __init__(
6166
conf_threshold: float = 0.3,
6267
detect_objects: Union[str, List[str]] = "person",
6368
fps: int = 30,
69+
detection_fps: float = 5.0,
6470
interval: int = 0,
6571
max_workers: int = 10,
6672
):
@@ -69,6 +75,7 @@ def __init__(
6975
self.api_key = api_key or os.getenv("MOONDREAM_API_KEY")
7076
self.conf_threshold = conf_threshold
7177
self.fps = fps
78+
self.detection_fps = detection_fps
7279
self.max_workers = max_workers
7380
self._shutdown = False
7481

@@ -77,6 +84,11 @@ def __init__(
7784
self._last_frame_time: Optional[float] = None
7885
self._last_frame_pil: Optional[Image.Image] = None
7986

87+
# Async detection state
88+
self._detection_in_progress = False
89+
self._last_detection_time: float = 0.0
90+
self._cached_results: Dict[str, Any] = {"detections": []}
91+
8092
# Font configuration constants for drawing efficiency
8193
self._font = cv2.FONT_HERSHEY_SIMPLEX
8294
self._font_scale = 0.5
@@ -110,6 +122,7 @@ def __init__(
110122

111123
logger.info("🌙 Moondream Processor initialized")
112124
logger.info(f"🎯 Detection configured for objects: {self.detect_objects}")
125+
logger.info(f"📹 Video FPS: {fps}, Detection FPS: {detection_fps}")
113126

114127
async def process_video(
115128
self,
@@ -213,28 +226,41 @@ def _run_detection_sync(self, frame_array: np.ndarray) -> List[Dict]:
213226
return all_detections
214227

215228
async def _process_and_add_frame(self, frame: av.VideoFrame):
229+
"""Process frame: pass through immediately, run detection asynchronously."""
216230
try:
217231
frame_array = frame.to_ndarray(format="rgb24")
232+
now = asyncio.get_event_loop().time()
218233

219-
results = await self._run_inference(frame_array)
234+
# Check if we should start a new detection
235+
detection_interval = 1.0 / self.detection_fps if self.detection_fps > 0 else float("inf")
236+
should_detect = (
237+
not self._detection_in_progress
238+
and (now - self._last_detection_time) >= detection_interval
239+
)
220240

221-
self._last_results = results
222-
self._last_frame_time = asyncio.get_event_loop().time()
241+
if should_detect:
242+
# Start detection in background (don't await)
243+
self._detection_in_progress = True
244+
self._last_detection_time = now
245+
asyncio.create_task(self._run_detection_background(frame_array.copy()))
246+
247+
# Always use cached results for annotation (don't wait for detection)
248+
self._last_frame_time = now
223249
self._last_frame_pil = Image.fromarray(frame_array)
224250

225-
# Annotate frame with detections
226-
if results.get("detections"):
251+
# Annotate frame with cached detections
252+
if self._cached_results.get("detections"):
227253
frame_array = annotate_detections(
228254
frame_array,
229-
results,
255+
self._cached_results,
230256
font=self._font,
231257
font_scale=self._font_scale,
232258
font_thickness=self._font_thickness,
233259
bbox_color=self._bbox_color,
234260
text_color=self._text_color,
235261
)
236262

237-
# Convert back to av.VideoFrame and publish
263+
# Convert back to av.VideoFrame and publish immediately
238264
processed_frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
239265
await self._video_track.add_frame(processed_frame)
240266

@@ -243,6 +269,18 @@ async def _process_and_add_frame(self, frame: av.VideoFrame):
243269
# Pass through original frame on error
244270
await self._video_track.add_frame(frame)
245271

272+
async def _run_detection_background(self, frame_array: np.ndarray):
273+
"""Run detection in background and update cached results."""
274+
try:
275+
results = await self._run_inference(frame_array)
276+
self._cached_results = results
277+
self._last_results = results
278+
logger.debug(f"🔍 Detection complete: {len(results.get('detections', []))} objects")
279+
except Exception as e:
280+
logger.warning(f"⚠️ Background detection failed: {e}")
281+
finally:
282+
self._detection_in_progress = False
283+
246284
def close(self):
247285
"""Clean up resources."""
248286
self._shutdown = True

plugins/moondream/vision_agents/plugins/moondream/detection/moondream_local_processor.py

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ class LocalDetectionProcessor(
3939
This processor downloads and runs the moondream3-preview model locally from Hugging Face,
4040
providing the same functionality as the cloud API version without requiring an API key.
4141
42+
Detection runs asynchronously in the background while frames pass through at
43+
full FPS. The last known detection results are overlaid on each frame.
44+
4245
Note: The moondream3-preview model is gated and requires authentication:
4346
- Request access at https://huggingface.co/moondream/moondream3-preview
4447
- Once approved, authenticate using one of:
@@ -51,6 +54,8 @@ class LocalDetectionProcessor(
5154
so any object string works. Examples: "person", "car",
5255
"basketball", ["person", "car", "dog"]. Default: "person"
5356
fps: Frame processing rate (default: 30)
57+
detection_fps: Rate at which to run detection (default: 10.0).
58+
Lower values reduce CPU/GPU load while maintaining smooth video.
5459
interval: Processing interval in seconds (default: 0)
5560
max_workers: Number of worker threads for CPU-intensive operations (default: 10)
5661
force_cpu: If True, force CPU usage even if CUDA/MPS is available (default: False).
@@ -67,6 +72,7 @@ def __init__(
6772
conf_threshold: float = 0.3,
6873
detect_objects: Union[str, List[str]] = "person",
6974
fps: int = 30,
75+
detection_fps: float = 10.0,
7076
interval: int = 0,
7177
max_workers: int = 10,
7278
force_cpu: bool = False,
@@ -82,6 +88,7 @@ def __init__(
8288
self.model_name = model_name
8389
self.conf_threshold = conf_threshold
8490
self.fps = fps
91+
self.detection_fps = detection_fps
8592
self.max_workers = max_workers
8693
self._shutdown = False
8794

@@ -94,6 +101,11 @@ def __init__(
94101
self._last_frame_time: Optional[float] = None
95102
self._last_frame_pil: Optional[Image.Image] = None
96103

104+
# Async detection state
105+
self._detection_in_progress = False
106+
self._last_detection_time: float = 0.0
107+
self._cached_results: Dict[str, Any] = {"detections": []}
108+
97109
# Font configuration constants for drawing efficiency
98110
self._font = cv2.FONT_HERSHEY_SIMPLEX
99111
self._font_scale = 0.5
@@ -123,6 +135,7 @@ def __init__(
123135
logger.info("🌙 Moondream Local Processor initialized")
124136
logger.info(f"🎯 Detection configured for objects: {self.detect_objects}")
125137
logger.info(f"🔧 Device: {self.device}")
138+
logger.info(f"📹 Video FPS: {fps}, Detection FPS: {detection_fps}")
126139

127140
@property
128141
def device(self) -> str:
@@ -303,32 +316,60 @@ def _run_detection_sync(self, image: Image.Image) -> List[Dict]:
303316
return all_detections
304317

305318
async def _process_and_add_frame(self, frame: av.VideoFrame):
319+
"""Process frame: pass through immediately, run detection asynchronously."""
306320
try:
307321
frame_array = frame.to_ndarray(format="rgb24")
308-
results = await self._run_inference(frame_array)
322+
now = asyncio.get_event_loop().time()
309323

310-
self._last_results = results
311-
self._last_frame_time = asyncio.get_event_loop().time()
324+
# Check if we should start a new detection
325+
detection_interval = 1.0 / self.detection_fps if self.detection_fps > 0 else float("inf")
326+
should_detect = (
327+
not self._detection_in_progress
328+
and (now - self._last_detection_time) >= detection_interval
329+
)
330+
331+
if should_detect:
332+
# Start detection in background (don't await)
333+
self._detection_in_progress = True
334+
self._last_detection_time = now
335+
asyncio.create_task(self._run_detection_background(frame_array.copy()))
336+
337+
# Always use cached results for annotation (don't wait for detection)
338+
self._last_frame_time = now
312339
self._last_frame_pil = Image.fromarray(frame_array)
313340

314-
if results.get("detections"):
341+
# Annotate frame with cached detections
342+
if self._cached_results.get("detections"):
315343
frame_array = annotate_detections(
316344
frame_array,
317-
results,
345+
self._cached_results,
318346
font=self._font,
319347
font_scale=self._font_scale,
320348
font_thickness=self._font_thickness,
321349
bbox_color=self._bbox_color,
322350
text_color=self._text_color,
323351
)
324352

353+
# Convert back to av.VideoFrame and publish immediately
325354
processed_frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
326355
await self._video_track.add_frame(processed_frame)
327356

328357
except Exception as e:
329358
logger.exception(f"❌ Frame processing failed: {e}")
330359
await self._video_track.add_frame(frame)
331360

361+
async def _run_detection_background(self, frame_array: np.ndarray):
362+
"""Run detection in background and update cached results."""
363+
try:
364+
results = await self._run_inference(frame_array)
365+
self._cached_results = results
366+
self._last_results = results
367+
logger.debug(f"🔍 Detection complete: {len(results.get('detections', []))} objects")
368+
except Exception as e:
369+
logger.warning(f"⚠️ Background detection failed: {e}")
370+
finally:
371+
self._detection_in_progress = False
372+
332373
def close(self):
333374
"""Clean up resources."""
334375
self._shutdown = True

0 commit comments

Comments
 (0)