Skip to content

Commit ebc809c

Browse files
authored
Speed-up time-based samplers by 20X and index-based by 1.5X (#284)
1 parent 1bdf928 commit ebc809c

File tree

4 files changed

+85
-156
lines changed

4 files changed

+85
-156
lines changed

src/torchcodec/samplers/_common.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import Callable, Union
22

3-
import torch
4-
from torchcodec import Frame, FrameBatch
3+
from torch import Tensor
4+
from torchcodec import FrameBatch
55

66
_LIST_OF_INT_OR_FLOAT = Union[list[int], list[float]]
77

@@ -42,22 +42,6 @@ def _error_policy(
4242
}
4343

4444

45-
def _chunk_list(lst, chunk_size):
46-
# return list of sublists of length chunk_size
47-
return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
48-
49-
50-
def _to_framebatch(frames: list[Frame]) -> FrameBatch:
51-
# IMPORTANT: see other IMPORTANT note in _decode_all_clips_indices and
52-
# _decode_all_clips_timestamps
53-
data = torch.stack([frame.data for frame in frames])
54-
pts_seconds = torch.tensor([frame.pts_seconds for frame in frames])
55-
duration_seconds = torch.tensor([frame.duration_seconds for frame in frames])
56-
return FrameBatch(
57-
data=data, pts_seconds=pts_seconds, duration_seconds=duration_seconds
58-
)
59-
60-
6145
def _validate_common_params(*, decoder, num_frames_per_clip, policy):
6246
if len(decoder) < 1:
6347
raise ValueError(
@@ -72,3 +56,19 @@ def _validate_common_params(*, decoder, num_frames_per_clip, policy):
7256
raise ValueError(
7357
f"Invalid policy ({policy}). Supported values are {_POLICY_FUNCTIONS.keys()}."
7458
)
59+
60+
61+
def _make_5d_framebatch(
62+
*,
63+
data: Tensor,
64+
pts_seconds: Tensor,
65+
duration_seconds: Tensor,
66+
num_clips: int,
67+
num_frames_per_clip: int,
68+
) -> FrameBatch:
69+
last_3_dims = data.shape[-3:]
70+
return FrameBatch(
71+
data=data.view(num_clips, num_frames_per_clip, *last_3_dims),
72+
pts_seconds=pts_seconds.view(num_clips, num_frames_per_clip),
73+
duration_seconds=duration_seconds.view(num_clips, num_frames_per_clip),
74+
)

src/torchcodec/samplers/_index_based.py

Lines changed: 19 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
from typing import List, Literal, Optional
1+
from typing import Literal, Optional
22

33
import torch
44

5-
from torchcodec import Frame, FrameBatch
5+
from torchcodec import FrameBatch
66
from torchcodec.decoders import VideoDecoder
7+
from torchcodec.decoders._core import get_frames_at_indices
78
from torchcodec.samplers._common import (
8-
_chunk_list,
9+
_make_5d_framebatch,
910
_POLICY_FUNCTION_TYPE,
1011
_POLICY_FUNCTIONS,
11-
_to_framebatch,
1212
_validate_common_params,
1313
)
1414

@@ -117,51 +117,6 @@ def _build_all_clips_indices(
117117
return all_clips_indices
118118

119119

120-
def _decode_all_clips_indices(
121-
decoder: VideoDecoder, all_clips_indices: list[int], num_frames_per_clip: int
122-
) -> list[FrameBatch]:
123-
# This takes the list of all the frames to decode (in arbitrary order),
124-
# decode all the frames, and then packs them into clips of length
125-
# num_frames_per_clip.
126-
#
127-
# To avoid backwards seeks (which are slow), we:
128-
# - sort all the frame indices to be decoded
129-
# - dedup them
130-
# - decode all unique frames in sorted order
131-
# - re-assemble the decoded frames back to their original order
132-
#
133-
# TODO: Write this in C++ so we can avoid the copies that happen in `_to_framebatch`
134-
135-
all_clips_indices_sorted, argsort = zip(
136-
*sorted((frame_index, i) for (i, frame_index) in enumerate(all_clips_indices))
137-
)
138-
previous_decoded_frame = None
139-
all_decoded_frames = [None] * len(all_clips_indices)
140-
for i, j in enumerate(argsort):
141-
frame_index = all_clips_indices_sorted[i]
142-
if (
143-
previous_decoded_frame is not None # then we know i > 0
144-
and frame_index == all_clips_indices_sorted[i - 1]
145-
):
146-
# Avoid decoding the same frame twice.
147-
# IMPORTANT: this is only correct because a copy of the frame will
148-
# happen within `_to_framebatch` when we call torch.stack.
149-
# If a copy isn't made, the same underlying memory will be used for
150-
# the 2 consecutive frames. When we re-write this, we should make
151-
# sure to explicitly copy the data.
152-
decoded_frame = previous_decoded_frame
153-
else:
154-
decoded_frame = decoder.get_frame_at(index=frame_index)
155-
previous_decoded_frame = decoded_frame
156-
all_decoded_frames[j] = decoded_frame
157-
158-
all_clips: list[list[Frame]] = _chunk_list(
159-
all_decoded_frames, chunk_size=num_frames_per_clip
160-
)
161-
162-
return [_to_framebatch(clip) for clip in all_clips]
163-
164-
165120
def _generic_index_based_sampler(
166121
kind: Literal["random", "regular"],
167122
decoder: VideoDecoder,
@@ -174,7 +129,7 @@ def _generic_index_based_sampler(
174129
# Important note: sampling_range_end defines the upper bound of where a clip
175130
# can *start*, not where a clip can end.
176131
policy: Literal["repeat_last", "wrap", "error"],
177-
) -> List[FrameBatch]:
132+
) -> FrameBatch:
178133

179134
_validate_common_params(
180135
decoder=decoder,
@@ -221,9 +176,18 @@ def _generic_index_based_sampler(
221176
num_frames_in_video=len(decoder),
222177
policy_fun=_POLICY_FUNCTIONS[policy],
223178
)
224-
return _decode_all_clips_indices(
225-
decoder,
226-
all_clips_indices=all_clips_indices,
179+
180+
# TODO: Use public method of decoder, when it exists
181+
frames, pts_seconds, duration_seconds = get_frames_at_indices(
182+
decoder._decoder,
183+
stream_index=decoder.stream_index,
184+
frame_indices=all_clips_indices,
185+
)
186+
return _make_5d_framebatch(
187+
data=frames,
188+
pts_seconds=pts_seconds,
189+
duration_seconds=duration_seconds,
190+
num_clips=num_clips,
227191
num_frames_per_clip=num_frames_per_clip,
228192
)
229193

@@ -237,7 +201,7 @@ def clips_at_random_indices(
237201
sampling_range_start: int = 0,
238202
sampling_range_end: Optional[int] = None, # interval is [start, end).
239203
policy: Literal["repeat_last", "wrap", "error"] = "repeat_last",
240-
) -> List[FrameBatch]:
204+
) -> FrameBatch:
241205
return _generic_index_based_sampler(
242206
kind="random",
243207
decoder=decoder,
@@ -259,7 +223,7 @@ def clips_at_regular_indices(
259223
sampling_range_start: int = 0,
260224
sampling_range_end: Optional[int] = None, # interval is [start, end).
261225
policy: Literal["repeat_last", "wrap", "error"] = "repeat_last",
262-
) -> List[FrameBatch]:
226+
) -> FrameBatch:
263227

264228
return _generic_index_based_sampler(
265229
kind="regular",

src/torchcodec/samplers/_time_based.py

Lines changed: 19 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
1-
from typing import List, Literal, Optional
1+
from typing import Literal, Optional
22

33
import torch
44

5-
from torchcodec import Frame, FrameBatch
6-
from torchcodec.decoders import VideoDecoder
5+
from torchcodec import FrameBatch
6+
from torchcodec.decoders._core import get_frames_by_pts
77
from torchcodec.samplers._common import (
8-
_chunk_list,
8+
_make_5d_framebatch,
99
_POLICY_FUNCTION_TYPE,
1010
_POLICY_FUNCTIONS,
11-
_to_framebatch,
1211
_validate_common_params,
1312
)
1413

@@ -147,51 +146,6 @@ def _build_all_clips_timestamps(
147146
return all_clips_timestamps
148147

149148

150-
def _decode_all_clips_timestamps(
151-
decoder: VideoDecoder, all_clips_timestamps: list[float], num_frames_per_clip: int
152-
) -> list[FrameBatch]:
153-
# This is 99% the same as _decode_all_clips_indices. The only change is the
154-
# call to .get_frame_displayed_at(pts) instead of .get_frame_at(idx)
155-
156-
all_clips_timestamps_sorted, argsort = zip(
157-
*sorted(
158-
(frame_index, i) for (i, frame_index) in enumerate(all_clips_timestamps)
159-
)
160-
)
161-
previous_decoded_frame = None
162-
all_decoded_frames = [None] * len(all_clips_timestamps)
163-
for i, j in enumerate(argsort):
164-
frame_pts_seconds = all_clips_timestamps_sorted[i]
165-
if (
166-
previous_decoded_frame is not None # then we know i > 0
167-
and frame_pts_seconds == all_clips_timestamps_sorted[i - 1]
168-
):
169-
# Avoid decoding the same frame twice.
170-
# Unfortunately this is unlikely to lead to speed-up as-is: it's
171-
# pretty unlikely that 2 pts will be the same since pts are float
172-
# contiguous values. Theoretically the dedup can still happen, but
173-
# it would be much more efficient to implement it at the frame index
174-
# level. We should do that once we implement that in C++.
175-
# See also https://github.com/pytorch/torchcodec/issues/256.
176-
#
177-
# IMPORTANT: this is only correct because a copy of the frame will
178-
# happen within `_to_framebatch` when we call torch.stack.
179-
# If a copy isn't made, the same underlying memory will be used for
180-
# the 2 consecutive frames. When we re-write this, we should make
181-
# sure to explicitly copy the data.
182-
decoded_frame = previous_decoded_frame
183-
else:
184-
decoded_frame = decoder.get_frame_displayed_at(seconds=frame_pts_seconds)
185-
previous_decoded_frame = decoded_frame
186-
all_decoded_frames[j] = decoded_frame
187-
188-
all_clips: list[list[Frame]] = _chunk_list(
189-
all_decoded_frames, chunk_size=num_frames_per_clip
190-
)
191-
192-
return [_to_framebatch(clip) for clip in all_clips]
193-
194-
195149
def _generic_time_based_sampler(
196150
kind: Literal["random", "regular"],
197151
decoder,
@@ -204,7 +158,7 @@ def _generic_time_based_sampler(
204158
sampling_range_start: Optional[float],
205159
sampling_range_end: Optional[float], # interval is [start, end).
206160
policy: str = "repeat_last",
207-
) -> List[FrameBatch]:
161+
) -> FrameBatch:
208162
# Note: *everywhere*, sampling_range_end denotes the upper bound of where a
209163
# clip can start. This is an *open* upper bound, i.e. we will make sure no
210164
# clip starts exactly at (or above) sampling_range_end.
@@ -246,6 +200,7 @@ def _generic_time_based_sampler(
246200
sampling_range_end, # excluded
247201
seconds_between_clip_starts,
248202
)
203+
num_clips = len(clip_start_seconds)
249204

250205
all_clips_timestamps = _build_all_clips_timestamps(
251206
clip_start_seconds=clip_start_seconds,
@@ -255,9 +210,17 @@ def _generic_time_based_sampler(
255210
policy_fun=_POLICY_FUNCTIONS[policy],
256211
)
257212

258-
return _decode_all_clips_timestamps(
259-
decoder,
260-
all_clips_timestamps=all_clips_timestamps,
213+
# TODO: Use public method of decoder, when it exists
214+
frames, pts_seconds, duration_seconds = get_frames_by_pts(
215+
decoder._decoder,
216+
stream_index=decoder.stream_index,
217+
timestamps=all_clips_timestamps,
218+
)
219+
return _make_5d_framebatch(
220+
data=frames,
221+
pts_seconds=pts_seconds,
222+
duration_seconds=duration_seconds,
223+
num_clips=num_clips,
261224
num_frames_per_clip=num_frames_per_clip,
262225
)
263226

@@ -272,7 +235,7 @@ def clips_at_random_timestamps(
272235
sampling_range_start: Optional[float] = None,
273236
sampling_range_end: Optional[float] = None, # interval is [start, end).
274237
policy: str = "repeat_last",
275-
) -> List[FrameBatch]:
238+
) -> FrameBatch:
276239
return _generic_time_based_sampler(
277240
kind="random",
278241
decoder=decoder,
@@ -296,7 +259,7 @@ def clips_at_regular_timestamps(
296259
sampling_range_start: Optional[float] = None,
297260
sampling_range_end: Optional[float] = None, # interval is [start, end).
298261
policy: str = "repeat_last",
299-
) -> List[FrameBatch]:
262+
) -> FrameBatch:
300263
return _generic_time_based_sampler(
301264
kind="regular",
302265
decoder=decoder,

0 commit comments

Comments
 (0)