diff --git a/inference/core/workflows/core_steps/loader.py b/inference/core/workflows/core_steps/loader.py index 630c6d088..ee54a7f15 100644 --- a/inference/core/workflows/core_steps/loader.py +++ b/inference/core/workflows/core_steps/loader.py @@ -201,6 +201,9 @@ from inference.core.workflows.core_steps.transformations.relative_static_crop.v1 import ( RelativeStaticCropBlockV1, ) +from inference.core.workflows.core_steps.transformations.stabilize_detections.v1 import ( + StabilizeTrackedDetectionsBlockV1, +) from inference.core.workflows.core_steps.transformations.stitch_images.v1 import ( StitchImagesBlockV1, ) @@ -308,93 +311,94 @@ def load_blocks() -> List[Type[WorkflowBlock]]: return [ - TimeInZoneBlockV1, - BoundingRectBlockV1, - SegmentAnything2BlockV1, - DetectionsConsensusBlockV1, - ClipComparisonBlockV1, - LMMBlockV1, - LMMForClassificationBlockV1, - OpenAIBlockV1, - CogVLMBlockV1, - OCRModelBlockV1, - YoloWorldModelBlockV1, - RoboflowInstanceSegmentationModelBlockV1, - RoboflowKeypointDetectionModelBlockV1, - RoboflowClassificationModelBlockV1, - RoboflowMultiLabelClassificationModelBlockV1, - RoboflowObjectDetectionModelBlockV1, - BarcodeDetectorBlockV1, - QRCodeDetectorBlockV1, AbsoluteStaticCropBlockV1, - DynamicCropBlockV1, - DetectionsFilterBlockV1, - DetectionOffsetBlockV1, - ByteTrackerBlockV1, - RelativeStaticCropBlockV1, - DetectionsTransformationBlockV1, - RoboflowDatasetUploadBlockV1, - ContinueIfBlockV1, - PerspectiveCorrectionBlockV1, - DynamicZonesBlockV1, - SizeMeasurementBlockV1, - DetectionsClassesReplacementBlockV1, - ExpressionBlockV1, - PropertyDefinitionBlockV1, - DimensionCollapseBlockV1, - FirstNonEmptyOrDefaultBlockV1, + AntropicClaudeBlockV1, BackgroundColorVisualizationBlockV1, + BarcodeDetectorBlockV1, BlurVisualizationBlockV1, BoundingBoxVisualizationBlockV1, + BoundingRectBlockV1, + ByteTrackerBlockV1, + ByteTrackerBlockV2, + CameraFocusBlockV1, CircleVisualizationBlockV1, + ClipComparisonBlockV1, + ClipComparisonBlockV2, + CogVLMBlockV1, ColorVisualizationBlockV1, + ContinueIfBlockV1, + ConvertGrayscaleBlockV1, CornerVisualizationBlockV1, CropVisualizationBlockV1, + DetectionOffsetBlockV1, + DetectionsClassesReplacementBlockV1, + DetectionsConsensusBlockV1, + DetectionsFilterBlockV1, + DetectionsStitchBlockV1, + DetectionsTransformationBlockV1, + DimensionCollapseBlockV1, + DistanceMeasurementBlockV1, + DominantColorBlockV1, DotVisualizationBlockV1, + DynamicCropBlockV1, + DynamicZonesBlockV1, EllipseVisualizationBlockV1, + ExpressionBlockV1, + FirstNonEmptyOrDefaultBlockV1, + Florence2BlockV1, + GoogleGeminiBlockV1, + GoogleVisionOCRBlockV1, HaloVisualizationBlockV1, + ImageBlurBlockV1, + ImageContoursDetectionBlockV1, + ImagePreprocessingBlockV1, + ImageSlicerBlockV1, + ImageThresholdBlockV1, + JSONParserBlockV1, + LMMBlockV1, + LMMForClassificationBlockV1, LabelVisualizationBlockV1, + LineCounterBlockV1, + LineCounterBlockV2, + LineCounterZoneVisualizationBlockV1, MaskVisualizationBlockV1, + ModelComparisonVisualizationBlockV1, + OCRModelBlockV1, + OpenAIBlockV1, + OpenAIBlockV2, + PathDeviationAnalyticsBlockV1, + PathDeviationAnalyticsBlockV2, + PerspectiveCorrectionBlockV1, PixelateVisualizationBlockV1, + PixelationCountBlockV1, PolygonVisualizationBlockV1, - LineCounterZoneVisualizationBlockV1, - ModelComparisonVisualizationBlockV1, - TriangleVisualizationBlockV1, + PolygonZoneVisualizationBlockV1, + PropertyDefinitionBlockV1, + QRCodeDetectorBlockV1, + RelativeStaticCropBlockV1, + RoboflowClassificationModelBlockV1, 
         RoboflowCustomMetadataBlockV1,
-        DetectionsStitchBlockV1,
-        ImageSlicerBlockV1,
-        DominantColorBlockV1,
-        PixelationCountBlockV1,
+        RoboflowDatasetUploadBlockV1,
+        RoboflowDatasetUploadBlockV2,
+        RoboflowInstanceSegmentationModelBlockV1,
+        RoboflowKeypointDetectionModelBlockV1,
+        RoboflowMultiLabelClassificationModelBlockV1,
+        RoboflowObjectDetectionModelBlockV1,
+        SIFTBlockV1,
         SIFTComparisonBlockV1,
         SIFTComparisonBlockV2,
-        SIFTBlockV1,
-        TemplateMatchingBlockV1,
-        ImageBlurBlockV1,
-        ConvertGrayscaleBlockV1,
-        ImageThresholdBlockV1,
-        ImageContoursDetectionBlockV1,
-        ClipComparisonBlockV2,
-        CameraFocusBlockV1,
-        RoboflowDatasetUploadBlockV2,
+        SegmentAnything2BlockV1,
+        SizeMeasurementBlockV1,
+        StabilityAIInpaintingBlockV1,
+        StabilizeTrackedDetectionsBlockV1,
         StitchImagesBlockV1,
-        OpenAIBlockV2,
-        JSONParserBlockV1,
+        TemplateMatchingBlockV1,
+        TimeInZoneBlockV1,
+        TimeInZoneBlockV2,
+        TriangleVisualizationBlockV1,
         VLMAsClassifierBlockV1,
-        GoogleGeminiBlockV1,
-        GoogleVisionOCRBlockV1,
         VLMAsDetectorBlockV1,
-        AntropicClaudeBlockV1,
-        LineCounterBlockV1,
-        PolygonZoneVisualizationBlockV1,
-        Florence2BlockV1,
-        DistanceMeasurementBlockV1,
-        StabilityAIInpaintingBlockV1,
-        ImagePreprocessingBlockV1,
-        PathDeviationAnalyticsBlockV1,
-        ByteTrackerBlockV2,
-        PathDeviationAnalyticsBlockV2,
-        TimeInZoneBlockV2,
-        LineCounterBlockV2,
+        YoloWorldModelBlockV1,
     ]
diff --git a/inference/core/workflows/core_steps/transformations/stabilize_detections/__init__.py b/inference/core/workflows/core_steps/transformations/stabilize_detections/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/inference/core/workflows/core_steps/transformations/stabilize_detections/v1.py b/inference/core/workflows/core_steps/transformations/stabilize_detections/v1.py
new file mode 100644
index 000000000..5a96b27be
--- /dev/null
+++ b/inference/core/workflows/core_steps/transformations/stabilize_detections/v1.py
@@ -0,0 +1,268 @@
+from collections import deque
+from typing import Deque, Dict, List, Literal, Optional, Set, Tuple, Type, Union
+
+import numpy as np
+import supervision as sv
+from pydantic import ConfigDict, Field
+
+from inference.core.workflows.execution_engine.entities.base import (
+    OutputDefinition,
+    WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+    FLOAT_ZERO_TO_ONE_KIND,
+    INSTANCE_SEGMENTATION_PREDICTION_KIND,
+    INTEGER_KIND,
+    OBJECT_DETECTION_PREDICTION_KIND,
+    StepOutputSelector,
+    WorkflowImageSelector,
+    WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+    BlockResult,
+    WorkflowBlock,
+    WorkflowBlockManifest,
+)
+
+OUTPUT_KEY: str = "tracked_detections"
+LONG_DESCRIPTION = """
+This block stores the last known position of each tracked bounding box.
+If a box disappears, the block brings it back, so short gaps are filled with the last known box position.
+The block requires detections to be tracked (i.e. each object must have a unique tracker_id assigned,
+which persists between frames).
+WARNING: this block will produce many short-lived bounding boxes for unstable trackers!
+"""
+
+
+class BlockManifest(WorkflowBlockManifest):
+    model_config = ConfigDict(
+        json_schema_extra={
+            "name": "Detections Stabilizer",
+            "version": "v1",
+            "short_description": "Apply a smoothing algorithm to reduce noise and flickering across video frames",
+            "long_description": LONG_DESCRIPTION,
+            "license": "Apache-2.0",
+            "block_type": "transformation",
+        }
+    )
+    type: Literal["roboflow_core/stabilize_detections@v1"]
+    image: WorkflowImageSelector
+    detections: StepOutputSelector(
+        kind=[
+            OBJECT_DETECTION_PREDICTION_KIND,
+            INSTANCE_SEGMENTATION_PREDICTION_KIND,
+        ]
+    ) = Field(  # type: ignore
+        description="Tracked detections",
+        examples=["$steps.object_detection_model.predictions"],
+    )
+    smoothing_window_size: Union[Optional[int], WorkflowParameterSelector(kind=[INTEGER_KIND])] = Field(  # type: ignore
+        default=3,
+        description="Predicted detection movement is smoothed based on historical velocity measurements;"
+        " this parameter controls the number of historical measurements taken into account when calculating the smoothed velocity."
+        " Detections are removed from smoothed predictions once they have been missing for more than this number of frames.",
+        examples=[5, "$inputs.smoothing_window_size"],
+    )
+    bbox_smoothing_coefficient: Union[Optional[float], WorkflowParameterSelector(kind=[FLOAT_ZERO_TO_ONE_KIND])] = Field(  # type: ignore
+        default=0.2,
+        description="Bounding box smoothing coefficient applied when a given tracker_id is present in the current frame."
+        " This parameter must be set to a value between 0 and 1.",
+        examples=[0.2, "$inputs.bbox_smoothing_coefficient"],
+    )
+
+    @classmethod
+    def describe_outputs(cls) -> List[OutputDefinition]:
+        return [
+            OutputDefinition(
+                name=OUTPUT_KEY,
+                kind=[
+                    OBJECT_DETECTION_PREDICTION_KIND,
+                    INSTANCE_SEGMENTATION_PREDICTION_KIND,
+                ],
+            ),
+        ]
+
+    @classmethod
+    def get_execution_engine_compatibility(cls) -> Optional[str]:
+        return ">=1.0.0,<2.0.0"
+
+
+class StabilizeTrackedDetectionsBlockV1(WorkflowBlock):
+    def __init__(self):
+        self._batch_of_last_known_detections: Dict[
+            str, Dict[Union[int, str], sv.Detections]
+        ] = {}
+        self._batch_of_kalman_filters: Dict[str, VelocityKalmanFilter] = {}
+
+    @classmethod
+    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+        return BlockManifest
+
+    def run(
+        self,
+        image: WorkflowImageData,
+        detections: sv.Detections,
+        smoothing_window_size: int,
+        bbox_smoothing_coefficient: float,
+    ) -> BlockResult:
+        metadata = image.video_metadata
+        if detections.tracker_id is None:
+            raise ValueError(
+                f"tracker_id not initialized, {self.__class__.__name__} requires detections to be tracked"
+            )
+        cached_detections = self._batch_of_last_known_detections.setdefault(
+            metadata.video_identifier, {}
+        )
+        kalman_filter = self._batch_of_kalman_filters.setdefault(
+            metadata.video_identifier,
+            VelocityKalmanFilter(smoothing_window_size=smoothing_window_size),
+        )
+        # measure per-tracker velocity as the displacement of the box center
+        # between the previous and the current frame
+        measured_velocities = {}
+        for tracker_id, xyxy in zip(detections.tracker_id, detections.xyxy):
+            if tracker_id not in cached_detections:
+                continue
+            x1, y1, x2, y2 = xyxy
+            this_frame_center_xy = [x1 + abs(x2 - x1) / 2, y1 + abs(y2 - y1) / 2]
+            x1, y1, x2, y2 = cached_detections[tracker_id].xyxy[0]
+            prev_frame_center_xy = [x1 + abs(x2 - x1) / 2, y1 + abs(y2 - y1) / 2]
+            measured_velocities[tracker_id] = (
+                this_frame_center_xy[0] - prev_frame_center_xy[0],
+                this_frame_center_xy[1] - prev_frame_center_xy[1],
+            )
+        predicted_velocities = kalman_filter.update(measurements=measured_velocities)
+
+        predicted_detections = {}
+        for i, tracker_id in enumerate(detections.tracker_id):
+            if tracker_id in cached_detections:
+                # tracker seen before - blend the current box with the cached one
+                prev_frame_detection = cached_detections[tracker_id]
+                prev_frame_xyxy = prev_frame_detection.xyxy[0]
+                curr_frame_detection = detections[i]
+                curr_frame_xyxy = curr_frame_detection.xyxy[0]
+                curr_frame_detection.xyxy[0] = smooth_xyxy(
+                    prev_xyxy=prev_frame_xyxy,
+                    curr_xyxy=curr_frame_xyxy,
+                    alpha=bbox_smoothing_coefficient,
+                )
+                predicted_detections[tracker_id] = curr_frame_detection
+            else:
+                predicted_detections[tracker_id] = detections[i]
+            cached_detections[tracker_id] = detections[i]
+        # missing trackers: extrapolate the cached box by the predicted velocity
+        for tracker_id, predicted_velocity in predicted_velocities.items():
+            if tracker_id in predicted_detections:
+                continue
+            prev_frame_detection = cached_detections[tracker_id]
+            prev_frame_xyxy = prev_frame_detection.xyxy[0]
+            curr_frame_xyxy = np.array(
+                [
+                    prev_frame_detection.xyxy[0]
+                    + np.array([predicted_velocity, predicted_velocity]).flatten()
+                ]
+            )
+            prev_frame_detection.xyxy = smooth_xyxy(
+                prev_xyxy=prev_frame_xyxy,
+                curr_xyxy=curr_frame_xyxy,
+                alpha=bbox_smoothing_coefficient,
+            )
+            predicted_detections[tracker_id] = prev_frame_detection
+        for tracker_id in list(cached_detections.keys()):
+            if (
+                tracker_id not in kalman_filter.tracked_vectors
+                and tracker_id not in predicted_detections
+            ):
+                del cached_detections[tracker_id]
+        merged_detections = sv.Detections.merge(predicted_detections.values())
+        if len(merged_detections) == 0:
+            merged_detections.tracker_id = np.array([])
+        return {OUTPUT_KEY: merged_detections}
+
+
+def smooth_xyxy(prev_xyxy: np.ndarray, curr_xyxy: np.ndarray, alpha=0.2) -> np.ndarray:
+    # exponential moving average of box coordinates
+    smoothed_xyxy = alpha * curr_xyxy + (1 - alpha) * prev_xyxy
+
+    return smoothed_xyxy
+
+
+class VelocityKalmanFilter:
+    # constant-velocity Kalman filter over a per-tracker (vx, vy) vector
+    def __init__(self, smoothing_window_size: int):
+        self.time_step = 1
+        self.smoothing_window_size = smoothing_window_size
+        self.state_transition_matrix = np.array([[1, 0], [0, 1]])
+        self.process_noise_covariance = np.eye(2) * 0.001
+        self.measurement_noise_covariance = np.eye(2) * 0.01
+        self.tracked_vectors: Dict[
+            Union[int, str],
+            Dict[
+                Literal["velocity", "error_covariance", "history"],
+                Union[np.ndarray, Deque[np.ndarray]],
+            ],
+        ] = {}
+
+    def predict(self) -> Dict[Union[int, str], np.ndarray]:
+        predictions: Dict[Union[int, str], np.ndarray] = {}
+        for tracker_id, data in self.tracked_vectors.items():
+            data["velocity"] = np.dot(self.state_transition_matrix, data["velocity"])
+            data["error_covariance"] = (
+                np.dot(
+                    np.dot(self.state_transition_matrix, data["error_covariance"]),
+                    self.state_transition_matrix.T,
+                )
+                + self.process_noise_covariance
+            )
+            predictions[tracker_id] = data["velocity"]
+        return predictions
+
+    def update(
+        self, measurements: Dict[Union[int, str], Tuple[float, float]]
+    ) -> Dict[Union[int, str], np.ndarray]:
+        updated_vector_ids: Set[Union[int, str]] = set()
+        for tracker_id, velocity in measurements.items():
+            updated_vector_ids.add(tracker_id)
+            if tracker_id in self.tracked_vectors:
+                measurement = np.array(velocity).reshape(2, 1)
+                tracked_vector = self.tracked_vectors[tracker_id]
+                tracked_vector["history"].appendleft(measurement)
+                # average recent measurements to reduce jitter before correction
+                smoothed_measurement = np.mean(tracked_vector["history"], axis=0)
+                measurement_residual = smoothed_measurement - tracked_vector["velocity"]
+                residual_covariance = (
+                    tracked_vector["error_covariance"]
+                    + self.measurement_noise_covariance
+                )
+                kalman_gain = np.dot(
+                    tracked_vector["error_covariance"],
+                    np.linalg.inv(residual_covariance),
+                )
+                tracked_vector["velocity"] = tracked_vector["velocity"] + np.dot(
+                    kalman_gain, measurement_residual
+                )
+                tracked_vector["error_covariance"] = tracked_vector[
+                    "error_covariance"
+                ] - np.dot(kalman_gain, tracked_vector["error_covariance"])
+            else:
+                # first measurement for this tracker - initialize the filter state
+                self.tracked_vectors[tracker_id] = {
+                    "velocity": np.array([[velocity[0]], [velocity[1]]]),
+                    "error_covariance": np.eye(2),
+                    "history": deque(
+                        [np.array([[velocity[0]], [velocity[1]]])],
+                        maxlen=self.smoothing_window_size,
+                    ),
+                }
+
+        predicted_velocities = self.predict()
+
+        # trackers without fresh measurements drain one history entry per
+        # frame; once the history is empty the tracker is dropped
+        for tracker_id in set(self.tracked_vectors.keys()) - updated_vector_ids:
+            if self.tracked_vectors[tracker_id]["history"]:
+                self.tracked_vectors[tracker_id]["history"].popleft()
+            if not self.tracked_vectors[tracker_id]["history"]:
+                del self.tracked_vectors[tracker_id]
+
+        return predicted_velocities
diff --git a/tests/workflows/unit_tests/core_steps/transformations/test_stabilize_detections.py b/tests/workflows/unit_tests/core_steps/transformations/test_stabilize_detections.py
new file mode 100644
index 000000000..86c01c6d2
--- /dev/null
+++ b/tests/workflows/unit_tests/core_steps/transformations/test_stabilize_detections.py
@@ -0,0 +1,208 @@
+import numpy as np
+import supervision as sv
+
+from inference.core.workflows.core_steps.transformations.stabilize_detections.v1 import (
+    BlockManifest,
+    StabilizeTrackedDetectionsBlockV1,
+    VelocityKalmanFilter,
+)
+from inference.core.workflows.execution_engine.entities.base import (
+    ImageParentMetadata,
+    VideoMetadata,
+    WorkflowImageData,
+)
+
+
+def test_stabilize_detections_validation_when_valid_manifest_is_given():
+    # given
+    data = {
+        "type": "roboflow_core/stabilize_detections@v1",
+        "name": "some",
+        "detections": "$steps.other.dets",
+        "image": "$inputs.image",
+        "smoothing_window_size": 5,
+        "bbox_smoothing_coefficient": 0.5,
+    }
+
+    # when
+    result = BlockManifest.model_validate(data)
+
+    # then
+    assert result == BlockManifest(
+        type="roboflow_core/stabilize_detections@v1",
+        name="some",
+        detections="$steps.other.dets",
+        image="$inputs.image",
+        smoothing_window_size=5,
+        bbox_smoothing_coefficient=0.5,
+    )
+
+
+def test_velocity_kalman_filter_stable_detections():
+    # given
+    kalman_filter = VelocityKalmanFilter(smoothing_window_size=3)
+    frame_1_vel = {
+        1: (10, 0),
+        2: (0, 15),
+        3: (15, 15),
+        4: (-7, 0),
+        5: (0, -7),
+        6: (-7, -7),
+    }
+    frame_2_vel = {
+        1: (15, 0),
+        2: (0, 15),
+        3: (15, 15),
+        4: (-7, 0),
+        5: (0, -7),
+        6: (-7, -7),
+    }
+    frame_3_vel = {
+        1: (20, 0),
+        2: (0, 20),
+        3: (20, 20),
+        4: (-5, 0),
+        5: (0, -5),
+        6: (-5, -5),
+    }
+    kalman_filter.update(measurements=frame_1_vel)
+    kalman_filter.update(measurements=frame_2_vel)
+    kalman_filter.update(measurements=frame_3_vel)
+
+    # when
+    predictions_1 = kalman_filter.update(measurements={})
+    predictions_2 = kalman_filter.update(measurements={})
+    predictions_3 = kalman_filter.update(measurements={})
+    predictions_4 = kalman_filter.update(measurements={})
+
+    # then
+    assert (
+        len(predictions_1) == 6
+    ), "6 predictions expected for the first frame with missing measurements, as the smoothing window contains 3 elements"
+    assert (
+        len(predictions_2) == 6
+    ), "6 predictions expected for the second frame with missing measurements, as the smoothing window contains 3 elements"
+    assert (
+        len(predictions_3) == 6
+    ), "6 predictions expected for the third frame with missing measurements, as the smoothing window contains 3 elements"
+    assert (
+        len(predictions_4) == 0
+    ), "No predictions expected for the fourth and subsequent frames"
+    assert (
+        len(kalman_filter.tracked_vectors) == 0
+    ), "No tracked vectors expected in the Kalman filter"
+
+
+def test_stabilize_detections():
+    # given
+    frame_1_dets = sv.Detections(
+        xyxy=np.array(
+            [
+                [10, 10, 100, 100],
+                [110, 110, 200, 200],
+            ]
+        ),
+        tracker_id=np.array([1, 2]),
+    )
+    frame_2_dets = sv.Detections(
+        xyxy=np.array(
+            [
+                [30, 10, 120, 100],
+                [110, 130, 200, 220],
+            ]
+        ),
+        tracker_id=np.array([1, 2]),
+    )
+    frame_3_dets = sv.Detections(
+        xyxy=np.array(
+            [
+                [60, 10, 150, 100],
+                [110, 160, 200, 250],
+            ]
+        ),
+        tracker_id=np.array([1, 2]),
+    )
+    frame_4_dets = sv.Detections(
+        xyxy=np.array(
+            [
+                [90, 10, 180, 100],
+                [110, 190, 200, 280],
+            ]
+        ),
+        tracker_id=np.array([1, 2]),
+    )
+    empty_dets = sv.Detections.merge([])
+    empty_dets.tracker_id = np.array([])
+
+    block = StabilizeTrackedDetectionsBlockV1()
+    parent_img_metadata = ImageParentMetadata("")
+    video_metadata = VideoMetadata(
+        video_identifier="1",
+        frame_number=1,
+        frame_timestamp=0,
+    )
+    img = WorkflowImageData(
+        parent_metadata=parent_img_metadata,
+        video_metadata=video_metadata,
+        numpy_image=np.zeros((10, 10, 3)),
+    )
+
+    # when
+    res_1 = block.run(
+        image=img,
+        detections=frame_1_dets,
+        smoothing_window_size=3,
+        bbox_smoothing_coefficient=0.3,
+    )
+    res_2 = block.run(
+        image=img,
+        detections=frame_2_dets,
+        smoothing_window_size=3,
+        bbox_smoothing_coefficient=0.3,
+    )
+    res_3 = block.run(
+        image=img,
+        detections=frame_3_dets,
+        smoothing_window_size=3,
+        bbox_smoothing_coefficient=0.3,
+    )
+    res_4 = block.run(
+        image=img,
+        detections=frame_4_dets,
+        smoothing_window_size=3,
+        bbox_smoothing_coefficient=0.3,
+    )
+    res_5 = block.run(
+        image=img,
+        detections=empty_dets,
+        smoothing_window_size=3,
+        bbox_smoothing_coefficient=0.3,
+    )
+    res_6 = block.run(
+        image=img,
+        detections=empty_dets,
+        smoothing_window_size=3,
+        bbox_smoothing_coefficient=0.3,
+    )
+    res_7 = block.run(
+        image=img,
+        detections=empty_dets,
+        smoothing_window_size=3,
+        bbox_smoothing_coefficient=0.3,
+    )
+    res_8 = block.run(
+        image=img,
+        detections=empty_dets,
+        smoothing_window_size=3,
+        bbox_smoothing_coefficient=0.3,
+    )
+
+    # then
+    assert len(res_1["tracked_detections"]) == 2
+    assert len(res_2["tracked_detections"]) == 2
+    assert len(res_3["tracked_detections"]) == 2
+    assert len(res_4["tracked_detections"]) == 2
+    assert len(res_5["tracked_detections"]) == 2
+    assert len(res_6["tracked_detections"]) == 2
+    assert len(res_7["tracked_detections"]) == 2
+    assert len(res_8["tracked_detections"]) == 0
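
Below is a minimal sketch of the prediction-decay behaviour the unit tests exercise, using only the VelocityKalmanFilter API introduced in this patch. It assumes the module path added above; the step and selector names in the second snippet are illustrative, while the field names come from BlockManifest.

# Sketch: a tracker that stops reporting measurements keeps receiving
# predicted velocities for at most one frame per stored history entry,
# after which it is forgotten. Velocities are (dx, dy) center offsets.
from inference.core.workflows.core_steps.transformations.stabilize_detections.v1 import (
    VelocityKalmanFilter,
)

kf = VelocityKalmanFilter(smoothing_window_size=3)

# tracker 1 is measured on two consecutive frames -> history holds two entries
kf.update(measurements={1: (10.0, 0.0)})
kf.update(measurements={1: (12.0, 0.0)})

# tracker 1 disappears; update({}) still predicts from the filter state while
# the history drains (one entry per frame), then the tracker is dropped
assert 1 in kf.update(measurements={})  # predicted velocity, roughly (11, 0)
assert 1 in kf.update(measurements={})
assert kf.update(measurements={}) == {}  # history exhausted -> no predictions

# Hypothetical wiring of the block in a workflow definition (the upstream
# step name and its output key are illustrative, not confirmed by this PR):
stabilizer_step = {
    "type": "roboflow_core/stabilize_detections@v1",
    "name": "stabilizer",
    "image": "$inputs.image",
    "detections": "$steps.byte_tracker.tracked_detections",
    "smoothing_window_size": 3,
    "bbox_smoothing_coefficient": 0.2,
}

Tying the decay to the measurement history bounds how long an extrapolated "ghost" box can persist after its tracker is lost: at most smoothing_window_size frames.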