From 3e638da63571d391ef36c5906b1532555c1ca7db Mon Sep 17 00:00:00 2001
From: Anthony Tafoya <87080582+Anthony-Tafoya@users.noreply.github.com>
Date: Tue, 17 Sep 2024 08:32:09 -0700
Subject: [PATCH] Adding Random Seed for Frame Processing (#3416)

* Adding Random Seed for Frame Processing

* Added Unit Tests

* Updating Unit Tests for Ffmpeg

* Make Logs More Detailed

---------

Co-authored-by: Anthony-Tafoya
Co-authored-by: J.Y. <132313008+jb-ye@users.noreply.github.com>
---
 nerfstudio/process_data/process_data_utils.py |  16 ++-
 .../video_to_nerfstudio_dataset.py            |   9 +-
 tests/process_data/test_misc.py               | 114 +++++++++++++++++-
 3 files changed, 133 insertions(+), 6 deletions(-)

diff --git a/nerfstudio/process_data/process_data_utils.py b/nerfstudio/process_data/process_data_utils.py
index b5b2391a09..3c9013abe3 100644
--- a/nerfstudio/process_data/process_data_utils.py
+++ b/nerfstudio/process_data/process_data_utils.py
@@ -15,6 +15,7 @@
 """Helper utils for processing data into the nerfstudio format."""
 
 import math
+import random
 import re
 import shutil
 import sys
@@ -126,6 +127,7 @@ def convert_video_to_images(
     verbose: bool = False,
     image_prefix: str = "frame_",
     keep_image_dir: bool = False,
+    random_seed: Optional[int] = None,
 ) -> Tuple[List[str], int]:
     """Converts a video into a sequence of images.
 
@@ -138,6 +140,7 @@ def convert_video_to_images(
         verbose: If True, logs the output of the command.
         image_prefix: Prefix to use for the image filenames.
         keep_image_dir: If True, don't delete the output directory if it already exists.
+        random_seed: If set, the seed used to randomly choose the frames of the video.
     Returns:
         A tuple containing summary of the conversion and the number of extracted frames.
     """
@@ -178,8 +181,6 @@ def convert_video_to_images(
             start_y = crop_factor[0]
             crop_cmd = f"crop=w=iw*{width}:h=ih*{height}:x=iw*{start_x}:y=ih*{start_y},"
 
-        spacing = num_frames // num_frames_target
-
         downscale_chains = [f"[t{i}]scale=iw/{2**i}:ih/{2**i}[out{i}]" for i in range(num_downscales + 1)]
         downscale_dirs = [Path(str(image_dir) + (f"_{2**i}" if i > 0 else "")) for i in range(num_downscales + 1)]
         downscale_paths = [downscale_dirs[i] / f"{image_prefix}%05d.png" for i in range(num_downscales + 1)]
@@ -196,8 +197,15 @@ def convert_video_to_images(
 
         ffmpeg_cmd += " -vsync vfr"
 
-        if spacing > 1:
-            CONSOLE.print("Number of frames to extract:", math.ceil(num_frames / spacing))
+        # Fall back to evenly spaced frame selection when no random seed is given
+        spacing = num_frames // num_frames_target
+        if random_seed is not None:
+            random.seed(random_seed)
+            frame_indices = sorted(random.sample(range(num_frames), num_frames_target))
+            select_cmd = "select='" + "+".join([f"eq(n\,{idx})" for idx in frame_indices]) + "',setpts=N/TB,"
+            CONSOLE.print(f"Extracting {num_frames_target} frames selected at random with seed {random_seed}.")
+        elif spacing > 1:
+            CONSOLE.print(f"Extracting {math.ceil(num_frames / spacing)} frames at evenly spaced intervals.")
             select_cmd = f"thumbnail={spacing},setpts=N/TB,"
         else:
             CONSOLE.print("[bold red]Can't satisfy requested number of frames. Extracting all frames.")
diff --git a/nerfstudio/process_data/video_to_nerfstudio_dataset.py b/nerfstudio/process_data/video_to_nerfstudio_dataset.py
index af17e7d6b6..51a8a0b761 100644
--- a/nerfstudio/process_data/video_to_nerfstudio_dataset.py
+++ b/nerfstudio/process_data/video_to_nerfstudio_dataset.py
@@ -16,7 +16,7 @@
 
 import shutil
 from dataclasses import dataclass
-from typing import Literal
+from typing import Literal, Optional
 
 from nerfstudio.process_data import equirect_utils, process_data_utils
 from nerfstudio.process_data.colmap_converter_to_nerfstudio_dataset import ColmapConverterToNerfstudioDataset
@@ -41,6 +41,10 @@ class VideoToNerfstudioDataset(ColmapConverterToNerfstudioDataset):
     """Feature matching method to use. Vocab tree is recommended for a balance of speed
     and accuracy. Exhaustive is slower but more accurate. Sequential is faster but
     should only be used for videos."""
+    random_seed: Optional[int] = None
+    """Random seed to select video frames for training set"""
+    eval_random_seed: Optional[int] = None
+    """Random seed to select video frames for eval set"""
 
     def main(self) -> None:
         """Process video into a nerfstudio dataset."""
@@ -59,6 +63,7 @@ def main(self) -> None:
                 num_downscales=0,
                 crop_factor=(0.0, 0.0, 0.0, 0.0),
                 verbose=self.verbose,
+                random_seed=self.random_seed,
             )
         else:
             # If we're not dealing with equirects we can downscale in one step.
@@ -71,6 +76,7 @@ def main(self) -> None:
                 verbose=self.verbose,
                 image_prefix="frame_train_" if self.eval_data is not None else "frame_",
                 keep_image_dir=False,
+                random_seed=self.random_seed,
             )
             if self.eval_data is not None:
                 summary_log_eval, num_extracted_frames_eval = process_data_utils.convert_video_to_images(
@@ -82,6 +88,7 @@ def main(self) -> None:
                     verbose=self.verbose,
                     image_prefix="frame_eval_",
                     keep_image_dir=True,
+                    random_seed=self.eval_random_seed,
                 )
                 summary_log += summary_log_eval
                 num_extracted_frames += num_extracted_frames_eval
diff --git a/tests/process_data/test_misc.py b/tests/process_data/test_misc.py
index 1b2404b517..23fc3453ca 100644
--- a/tests/process_data/test_misc.py
+++ b/tests/process_data/test_misc.py
@@ -2,13 +2,21 @@
 Test misc data utils
 """
 
+import os
+import re
+from pathlib import Path
+from unittest import mock
+
+import cv2
 import numpy as np
+from PIL import Image
 from pyquaternion import Quaternion
 from scipy.spatial.transform import Rotation
 
 # TODO(1480) use pycolmap instead of colmap_parsing_utils
 # import pycolmap
 from nerfstudio.data.utils.colmap_parsing_utils import qvec2rotmat
+from nerfstudio.process_data.process_data_utils import convert_video_to_images
 
 
 def test_scalar_first_scalar_last_quaternions():
@@ -39,7 +47,7 @@
 
     # Expected Rotation matrix
     # fmt: off
-     R_expected = np.array(
+    R_expected = np.array(
         [
             [ 0.81379768, -0.44096961,  0.37852231],
             [ 0.46984631,  0.88256412,  0.01802831],
@@ -61,3 +69,107 @@
     # R = pycolmap.qvec_to_rotmat(wxyz)
     R = qvec2rotmat(wxyz)
     assert np.allclose(R, R_expected)
+
+
+def test_process_video_conversion_with_seed(tmp_path: Path):
+    """
+    Test convert_video_to_images by creating a mock video and ensuring correct frame extraction with seed.
+    """
+
+    # Inner functions needed for the unit tests
+    def create_mock_video(video_path: Path, frame_dir: Path, num_frames=10, frame_rate=1):
+        """Creates a mock video from a series of frames using OpenCV."""
+
+        first_frame = cv2.imread(str(frame_dir / "frame_0.png"))
+        height, width, _ = first_frame.shape
+        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+        out = cv2.VideoWriter(str(video_path), fourcc, frame_rate, (width, height))
+
+        for i in range(num_frames):
+            frame_path = frame_dir / f"frame_{i}.png"
+            frame = cv2.imread(str(frame_path))
+            out.write(frame)
+        out.release()
+
+    def extract_frame_numbers(ffmpeg_command: str):
+        """Extracts the frame numbers from the ffmpeg command"""
+
+        pattern = r"eq\(n\\,(\d+)\)"
+        matches = re.findall(pattern, ffmpeg_command)
+        frame_numbers = [int(match) for match in matches]
+        return frame_numbers
+
+    # Create a directory to hold the mock video and its frames
+    video_dir = tmp_path / "video"
+    video_dir.mkdir(exist_ok=True)
+
+    # Set parameters for the mock video
+    video_path = video_dir / "mock_video.mp4"
+    num_frames = 10
+    frame_height = 150
+    frame_width = 100
+    frame_rate = 1
+
+    # Create the mock video
+    for i in range(num_frames):
+        img = Image.new("RGB", (frame_width, frame_height), (0, 0, 0))
+        img.save(video_dir / f"frame_{i}.png")
+    create_mock_video(video_path, video_dir, num_frames=num_frames, frame_rate=frame_rate)
+
+    # Call convert_video_to_images
+    image_output_dir = tmp_path / "extracted_images"
+    num_frames_target = 5
+    num_downscales = 1
+    crop_factor = (0.0, 0.0, 0.0, 0.0)
+
+    # Mock missing COLMAP and ffmpeg in the dev env
+    old_path = os.environ.get("PATH", "")
+    os.environ["PATH"] = str(tmp_path / "mocked_bin") + f":{old_path}"
+    (tmp_path / "mocked_bin").mkdir()
+    (tmp_path / "mocked_bin" / "colmap").touch(mode=0o777)
+    (tmp_path / "mocked_bin" / "ffmpeg").touch(mode=0o777)
+
+    # Return value of 10 for the get_num_frames_in_video run_command call
+    with mock.patch("nerfstudio.process_data.process_data_utils.run_command", return_value="10") as mock_run_func:
+        summary_log, extracted_frame_count = convert_video_to_images(
+            video_path=video_path,
+            image_dir=image_output_dir,
+            num_frames_target=num_frames_target,
+            num_downscales=num_downscales,
+            crop_factor=crop_factor,
+            verbose=False,
+            random_seed=42,
+        )
+        assert mock_run_func.call_count == 2, f"Expected 2 calls, but got {mock_run_func.call_count}"
+        first_frames = extract_frame_numbers(mock_run_func.call_args[0][0])
+        assert len(first_frames) == 5, f"Expected 5 frames, but got {len(first_frames)}"
+
+        summary_log, extracted_frame_count = convert_video_to_images(
+            video_path=video_path,
+            image_dir=image_output_dir,
+            num_frames_target=num_frames_target,
+            num_downscales=num_downscales,
+            crop_factor=crop_factor,
+            verbose=False,
+            random_seed=42,
+        )
+
+        assert mock_run_func.call_count == 4, f"Expected 4 total calls, but got {mock_run_func.call_count}"
+        second_frames = extract_frame_numbers(mock_run_func.call_args[0][0])
+        assert len(second_frames) == 5, f"Expected 5 frames, but got {len(second_frames)}"
+        assert first_frames == second_frames
+
+        summary_log, extracted_frame_count = convert_video_to_images(
+            video_path=video_path,
+            image_dir=image_output_dir,
+            num_frames_target=num_frames_target,
+            num_downscales=num_downscales,
+            crop_factor=crop_factor,
+            verbose=False,
+            random_seed=52,
+        )
+
+        assert mock_run_func.call_count == 6, f"Expected 6 total calls, but got {mock_run_func.call_count}"
+        third_frames = extract_frame_numbers(mock_run_func.call_args[0][0])
+        assert len(third_frames) == 5, f"Expected 5 frames, but got {len(third_frames)}"
+        assert first_frames != third_frames
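
Editor's note: the snippet below is a minimal standalone sketch, not part of the patch, illustrating the seeded frame-selection logic that convert_video_to_images now uses. The values of num_frames and num_frames_target are assumed examples (in the real code they come from the video probe and the CLI); the select filter string is built the same way as in the hunk above.

    import random

    num_frames = 300        # assumed total frame count of the input video
    num_frames_target = 5   # assumed number of frames requested

    random.seed(42)         # fixing the seed makes the selection reproducible across runs
    frame_indices = sorted(random.sample(range(num_frames), num_frames_target))

    # Same ffmpeg select-filter construction as in convert_video_to_images
    select_cmd = "select='" + "+".join(f"eq(n\\,{idx})" for idx in frame_indices) + "',setpts=N/TB,"
    print(frame_indices)    # five sorted, seed-dependent indices in [0, 300)
    print(select_cmd)

Re-running the snippet with the same seed prints the same indices, which is exactly what the unit test above checks against the mocked ffmpeg command, and a different seed (e.g. 52) yields a different selection.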