
Add stream_index seek mode, read frame index and update metadata #764


Open · wants to merge 12 commits into main
56 changes: 55 additions & 1 deletion src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -319,6 +319,42 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
scannedAllStreams_ = true;
}

void SingleStreamDecoder::readCustomFrameMappingsUpdateMetadataAndIndex(
int streamIndex,
std::tuple<at::Tensor, at::Tensor, at::Tensor> customFrameMappings) {
auto& all_frames = std::get<0>(customFrameMappings);
auto& is_key_frame = std::get<1>(customFrameMappings);
auto& duration = std::get<2>(customFrameMappings);
TORCH_CHECK(
all_frames.size(0) == is_key_frame.size(0) &&
is_key_frame.size(0) == duration.size(0),
"all_frames, is_key_frame, and duration from custom_frame_mappings were not same size.");

auto& streamMetadata = containerMetadata_.allStreamMetadata[streamIndex];

streamMetadata.beginStreamPtsFromContent = all_frames[0].item<int64_t>();
streamMetadata.endStreamPtsFromContent =
all_frames[-1].item<int64_t>() + duration[-1].item<int64_t>();
Contributor:

TIL that tensors on the C++ side also support negative indices! I'm so used to that being not allowed in C++ arrays and standard containers that I initially thought this was undefined behavior!
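
For illustration, a minimal standalone sketch of that behavior (not part of the PR; assumes only libtorch):

#include <torch/torch.h>

// ATen tensors accept Python-style negative indices in C++, unlike raw
// arrays and standard containers.
int main() {
  torch::Tensor pts = torch::tensor({0, 512, 1024});
  int64_t last = pts[-1].item<int64_t>(); // 1024, same as pts[pts.size(0) - 1]
  TORCH_CHECK(last == 1024);
  return 0;
}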


auto avStream = formatContext_->streams[streamIndex];
streamMetadata.beginStreamPtsSecondsFromContent =
*streamMetadata.beginStreamPtsFromContent * av_q2d(avStream->time_base);

streamMetadata.endStreamPtsSecondsFromContent =
*streamMetadata.endStreamPtsFromContent * av_q2d(avStream->time_base);

Contributor:

We should probably use the ptsToSeconds() function here, but it looks like we're also not using it in the scanning function. And we can probably better define ptsToSeconds() to use av_q2d(). Let's address that elsewhere; created #770 to track.
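
For reference, the conversion both call sites perform amounts to the following sketch of how ptsToSeconds() could be defined in terms of av_q2d(), per #770 (the codebase's actual signature may differ):

extern "C" {
#include <libavutil/rational.h>
}

// Convert a pts expressed in stream time-base units to seconds.
// av_q2d(r) is simply (double)r.num / (double)r.den.
inline double ptsToSeconds(int64_t pts, AVRational timeBase) {
  return static_cast<double>(pts) * av_q2d(timeBase);
}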

streamMetadata.numFramesFromContent = all_frames.size(0);
for (int64_t i = 0; i < all_frames.size(0); ++i) {
// FrameInfo struct utilizes PTS
FrameInfo frameInfo = {all_frames[i].item<int64_t>()};
Member:

I'm not very familiar with that kind of initialization, but it's not immediately obvious that it's setting the frameInfo.pts field. Let's set the field explicitly instead, for clarity.
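
The suggested rewrite would look something like this (a sketch; it assumes the first FrameInfo field being set positionally above is pts):

FrameInfo frameInfo;
frameInfo.pts = all_frames[i].item<int64_t>();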

frameInfo.isKeyFrame = is_key_frame[i].item<bool>();
frameInfo.nextPts = (i + 1 < all_frames.size(0))
? all_frames[i + 1].item<int64_t>()
: INT64_MAX;
Member:

Let's also set the frameInfo's frameIndex field. I think it should be i?
EDIT: it should actually be set after we are sure the sequence is sorted, see other comment below.

streamInfos_[streamIndex].allFrames.push_back(frameInfo);
Member:

We should also make sure to update the streamInfos_[streamIndex].keyFrames index, if isKeyFrame is true!
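
Sketched, the extra bookkeeping inside the loop would be along these lines (mirroring what scanFileAndUpdateMetadataAndIndex() does for key frames):

// Key frames belong in both indices.
if (frameInfo.isKeyFrame) {
  streamInfos_[streamIndex].keyFrames.push_back(frameInfo);
}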

}
}

Member:

I realize reading this that there is an additional design decision we'll have to make: whether we expect the index to be already sorted, or not.

In the existing scanFileAndUpdateMetadataAndIndex() function we are reading packets in order, and yet we are sorting them afterwards:

// Sort all frames by their pts.
for (auto& [streamIndex, streamInfo] : streamInfos_) {
std::sort(
streamInfo.keyFrames.begin(),
streamInfo.keyFrames.end(),
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
return frameInfo1.pts < frameInfo2.pts;
});
std::sort(
streamInfo.allFrames.begin(),
streamInfo.allFrames.end(),
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
return frameInfo1.pts < frameInfo2.pts;
});

I suspect that frame mappings coming from ffprobe won't be ordered in general. I think we have the following options:

  • expect the input mapping to be sorted - that may not be a great UX
  • sort the mapping in Python - this is kinda what this PR is doing, by sorting the mappings in the tests - but we should remove that and have that logic within the code, not the tests
  • sort the mappings in C++

I think the simpler, safer choice is to sort in C++ and rely on the same sorting logic that we have in scanFileAndUpdateMetadataAndIndex(). Curious what your thoughts are @Dan-Flores @scotts ?

Contributor:

Agreed on sorting on the C++ side; we can do it quickly there, and it's much nicer to users. We should extract the existing logic into a utility function called only in this .cpp file, and call it in both places.

I think we sort in the scan function because it's possible for the actual packets to not be in PTS order.
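
A sketch of that extraction (helper name and placement are ours; it assumes FrameInfo carries a frameIndex field, per the earlier comment, and assigns it only once the order is final):

#include <algorithm>
#include <vector>

namespace {
// File-local helper shared by scanFileAndUpdateMetadataAndIndex() and
// readCustomFrameMappingsUpdateMetadataAndIndex(): sort a stream's frame
// index by pts, then assign each frame's position.
void sortFramesByPts(std::vector<FrameInfo>& frames) {
  std::sort(
      frames.begin(),
      frames.end(),
      [](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
        return frameInfo1.pts < frameInfo2.pts;
      });
  for (int64_t i = 0; i < static_cast<int64_t>(frames.size()); ++i) {
    frames[i].frameIndex = i;
  }
}
} // namespace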

ContainerMetadata SingleStreamDecoder::getContainerMetadata() const {
return containerMetadata_;
}
@@ -431,7 +467,9 @@ void SingleStreamDecoder::addStream(

void SingleStreamDecoder::addVideoStream(
int streamIndex,
const VideoStreamOptions& videoStreamOptions) {
const VideoStreamOptions& videoStreamOptions,
std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
customFrameMappings) {
addStream(
streamIndex,
AVMEDIA_TYPE_VIDEO,
@@ -456,6 +494,14 @@
streamMetadata.height = streamInfo.codecContext->height;
streamMetadata.sampleAspectRatio =
streamInfo.codecContext->sample_aspect_ratio;

if (seekMode_ == SeekMode::custom_frame_mappings) {
TORCH_CHECK(
customFrameMappings.has_value(),
"Please provide frame mappings when using custom_frame_mappings seek mode.");
readCustomFrameMappingsUpdateMetadataAndIndex(
streamIndex, customFrameMappings.value());
}
}

void SingleStreamDecoder::addAudioStream(
@@ -1407,6 +1453,7 @@ int SingleStreamDecoder::getKeyFrameIndexForPtsUsingScannedIndex(
int64_t SingleStreamDecoder::secondsToIndexLowerBound(double seconds) {
auto& streamInfo = streamInfos_[activeStreamIndex_];
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact: {
auto frame = std::lower_bound(
streamInfo.allFrames.begin(),
@@ -1434,6 +1481,7 @@ int64_t SingleStreamDecoder::secondsToIndexLowerBound(double seconds) {
int64_t SingleStreamDecoder::secondsToIndexUpperBound(double seconds) {
auto& streamInfo = streamInfos_[activeStreamIndex_];
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact: {
auto frame = std::upper_bound(
streamInfo.allFrames.begin(),
@@ -1461,6 +1509,7 @@ int64_t SingleStreamDecoder::secondsToIndexUpperBound(double seconds) {
int64_t SingleStreamDecoder::getPts(int64_t frameIndex) {
auto& streamInfo = streamInfos_[activeStreamIndex_];
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamInfo.allFrames[frameIndex].pts;
case SeekMode::approximate: {
@@ -1485,6 +1534,7 @@
std::optional<int64_t> SingleStreamDecoder::getNumFrames(
const StreamMetadata& streamMetadata) {
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamMetadata.numFramesFromContent.value();
case SeekMode::approximate: {
@@ -1498,6 +1548,7 @@
double SingleStreamDecoder::getMinSeconds(
const StreamMetadata& streamMetadata) {
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamMetadata.beginStreamPtsSecondsFromContent.value();
case SeekMode::approximate:
@@ -1510,6 +1561,7 @@
std::optional<double> SingleStreamDecoder::getMaxSeconds(
const StreamMetadata& streamMetadata) {
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamMetadata.endStreamPtsSecondsFromContent.value();
case SeekMode::approximate: {
@@ -1645,6 +1697,8 @@ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
return SingleStreamDecoder::SeekMode::exact;
} else if (seekMode == "approximate") {
return SingleStreamDecoder::SeekMode::approximate;
} else if (seekMode == "custom_frame_mappings") {
return SingleStreamDecoder::SeekMode::custom_frame_mappings;
} else {
TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode));
}
13 changes: 11 additions & 2 deletions src/torchcodec/_core/SingleStreamDecoder.h
@@ -29,7 +29,7 @@ class SingleStreamDecoder {
// CONSTRUCTION API
// --------------------------------------------------------------------------

enum class SeekMode { exact, approximate };
enum class SeekMode { exact, approximate, custom_frame_mappings };
Member:

I don't know if we'll want to publicly expose a new seek_mode in Python, but I think for the C++ side this is a reasonable approach. @scotts curious if you have any opinion on this?

Contributor:

Agreed. Right now, the SingleStreamDecoder's seekMode_ is what we use to determine if we should do a file scan or not:

if (seekMode_ == SeekMode::exact) {
scanFileAndUpdateMetadataAndIndex();
}

Since the point of this feature is to avoid a file scan while still maintaining seek accuracy, I think it makes sense for it to be a new seek mode on the C++ side. We can figure out what to do with the public API later.
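
Under that approach the constructor-side dispatch stays unchanged; our reading of the intended control flow, sketched (the new mode builds its index later, when addVideoStream() receives the mappings):

if (seekMode_ == SeekMode::exact) {
  scanFileAndUpdateMetadataAndIndex();
}
// SeekMode::custom_frame_mappings intentionally does not scan here:
// readCustomFrameMappingsUpdateMetadataAndIndex() runs in addVideoStream().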


// Creates a SingleStreamDecoder from the video at videoFilePath.
explicit SingleStreamDecoder(
@@ -53,6 +53,13 @@
// the allFrames and keyFrames vectors.
void scanFileAndUpdateMetadataAndIndex();

// Reads the user-provided frame mappings and updates the given stream's
// index, i.e. the allFrames and keyFrames vectors, as well as metadata
// such as endStreamPtsSecondsFromContent.
void readCustomFrameMappingsUpdateMetadataAndIndex(
int streamIndex,
std::tuple<at::Tensor, at::Tensor, at::Tensor> customFrameMappings);

Member:

Just bumping #764 (comment) again, which may have been missed:

  • we should document the expected length, dtype, and associated semantics of each tensor.
  • I'd also recommend relying on a new FrameMappings struct instead of a 3-tuple.

Contributor:

To elaborate on defining a new FrameMappings struct: I think of the code in custom_ops.cpp as the bridge layer between the C++ logic and the Python logic. So that's the layer where we would turn a tuple of tensors into a struct. That way, in the C++ logic, we're (as much as possible) operating on proper types with meaningful field names.

As a point of comparison, we do something similar with batched output. SingleStreamDecoder returns a FrameBatchOutput struct, and the code in custom_ops.cpp turns that into a tuple of tensors.
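
A sketch of that struct, folding in the documentation the earlier comment asks for (field names and exact dtypes are our assumptions, inferred from the current 3-tuple):

// Built in custom_ops.cpp from the (all_frames, is_key_frame, duration)
// tuple of 1D tensors, all of length N (one entry per frame):
struct FrameMappings {
  at::Tensor allFrames;  // int64, N: pts of each frame, in time-base units
  at::Tensor isKeyFrame; // bool, N: whether the frame at that position is a key frame
  at::Tensor duration;   // int64, N: duration of each frame, in time-base units
};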

// Returns the metadata for the container.
ContainerMetadata getContainerMetadata() const;

@@ -66,7 +73,9 @@

void addVideoStream(
int streamIndex,
const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
const VideoStreamOptions& videoStreamOptions = VideoStreamOptions(),
std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
customFrameMappings = std::nullopt);
void addAudioStream(
int streamIndex,
const AudioStreamOptions& audioStreamOptions = AudioStreamOptions());
17 changes: 11 additions & 6 deletions src/torchcodec/_core/custom_ops.cpp
@@ -36,9 +36,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
m.def(
"_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, str? color_conversion_library=None) -> ()");
"_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
m.def(
"add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None) -> ()");
"add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
m.def(
"add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -223,6 +223,8 @@ void _add_video_stream(
std::optional<std::string_view> dimension_order = std::nullopt,
std::optional<int64_t> stream_index = std::nullopt,
std::optional<std::string_view> device = std::nullopt,
std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
custom_frame_mappings = std::nullopt,
std::optional<std::string_view> color_conversion_library = std::nullopt) {
VideoStreamOptions videoStreamOptions;
videoStreamOptions.width = width;
@@ -253,9 +255,9 @@
if (device.has_value()) {
videoStreamOptions.device = createTorchDevice(std::string(device.value()));
}

auto videoDecoder = unwrapTensorToGetDecoder(decoder);
videoDecoder->addVideoStream(stream_index.value_or(-1), videoStreamOptions);
videoDecoder->addVideoStream(
stream_index.value_or(-1), videoStreamOptions, custom_frame_mappings);
}

// Add a new video stream at `stream_index` using the provided options.
@@ -266,15 +268,18 @@ void add_video_stream(
std::optional<int64_t> num_threads = std::nullopt,
std::optional<std::string_view> dimension_order = std::nullopt,
std::optional<int64_t> stream_index = std::nullopt,
std::optional<std::string_view> device = std::nullopt) {
std::optional<std::string_view> device = std::nullopt,
std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
custom_frame_mappings = std::nullopt) {
_add_video_stream(
decoder,
width,
height,
num_threads,
dimension_order,
stream_index,
device);
device,
custom_frame_mappings);
}

void add_audio_stream(
6 changes: 6 additions & 0 deletions src/torchcodec/_core/ops.py
@@ -205,6 +205,9 @@ def _add_video_stream_abstract(
dimension_order: Optional[str] = None,
stream_index: Optional[int] = None,
device: Optional[str] = None,
custom_frame_mappings: Optional[
tuple[torch.Tensor, torch.Tensor, torch.Tensor]
] = None,
color_conversion_library: Optional[str] = None,
) -> None:
return
@@ -220,6 +223,9 @@ def add_video_stream_abstract(
dimension_order: Optional[str] = None,
stream_index: Optional[int] = None,
device: Optional[str] = None,
custom_frame_mappings: Optional[
tuple[torch.Tensor, torch.Tensor, torch.Tensor]
] = None,
) -> None:
return

35 changes: 19 additions & 16 deletions test/test_metadata.py
@@ -31,23 +31,26 @@ def _get_container_metadata(path, seek_mode):
return get_container_metadata(decoder)


@pytest.mark.parametrize(
"metadata_getter",
(
get_container_metadata_from_header,
functools.partial(_get_container_metadata, seek_mode="approximate"),
functools.partial(_get_container_metadata, seek_mode="exact"),
),
)
def test_get_metadata(metadata_getter):
with_scan = (
metadata_getter.keywords["seek_mode"] == "exact"
if isinstance(metadata_getter, functools.partial)
else False
@pytest.mark.parametrize("seek_mode", ["approximate", "exact", "custom_frame_mappings"])
Member:

I think you may have removed get_container_metadata_from_header from the previous parametrization.

def test_get_metadata(seek_mode):
from torchcodec._core import add_video_stream

decoder = create_from_file(str(NASA_VIDEO.path), seek_mode=seek_mode)
# For custom_frame_mappings seek mode, add a video stream to update metadata
custom_frame_mappings = (
NASA_VIDEO.get_custom_frame_mappings()
if seek_mode == "custom_frame_mappings"
else None
)
# Add the best video stream (index 3 for NASA_VIDEO)
add_video_stream(
decoder,
stream_index=NASA_VIDEO.default_stream_index,
custom_frame_mappings=custom_frame_mappings,
)
metadata = get_container_metadata(decoder)

metadata = metadata_getter(NASA_VIDEO.path)
with_scan = seek_mode in ("exact", "custom_frame_mappings")

assert len(metadata.streams) == 6
assert metadata.best_video_stream_index == 3
@@ -82,7 +85,7 @@ def test_get_metadata(metadata_getter):
assert best_video_stream_metadata.begin_stream_seconds_from_header == 0
assert best_video_stream_metadata.bit_rate == 128783
assert best_video_stream_metadata.average_fps == pytest.approx(29.97, abs=0.001)
assert best_video_stream_metadata.pixel_aspect_ratio is None
assert best_video_stream_metadata.pixel_aspect_ratio == Fraction(1, 1)
Contributor Author:

Since add_video_stream is always being called, this value is set instead of None.

assert best_video_stream_metadata.codec == "h264"
assert best_video_stream_metadata.num_frames_from_content == (
390 if with_scan else None
63 changes: 63 additions & 0 deletions test/test_ops.py
@@ -448,6 +448,69 @@ def test_frame_pts_equality(self):
)
assert pts_is_equal

def test_seek_mode_custom_frame_mappings_fails(self):
decoder = create_from_file(
str(NASA_VIDEO.path), seek_mode="custom_frame_mappings"
)
with pytest.raises(
RuntimeError,
match="Please provide frame mappings when using custom_frame_mappings seek mode.",
):
add_video_stream(decoder, stream_index=0, custom_frame_mappings=None)

decoder = create_from_file(
str(NASA_VIDEO.path), seek_mode="custom_frame_mappings"
)
different_lengths = (
torch.tensor([1, 2, 3]),
torch.tensor([1, 2]),
torch.tensor([1, 2, 3]),
)
with pytest.raises(
RuntimeError,
match="all_frames, is_key_frame, and duration from custom_frame_mappings were not same size.",
):
add_video_stream(
decoder, stream_index=0, custom_frame_mappings=different_lengths
)

@pytest.mark.parametrize("device", cpu_and_cuda())
def test_seek_mode_custom_frame_mappings(self, device):
stream_index = 3  # custom_frame_mappings seek mode requires an explicit stream index
decoder = create_from_file(
str(NASA_VIDEO.path), seek_mode="custom_frame_mappings"
)
add_video_stream(
decoder,
device=device,
stream_index=stream_index,
custom_frame_mappings=NASA_VIDEO.get_custom_frame_mappings(
stream_index=stream_index
),
)

frame0, _, _ = get_next_frame(decoder)
reference_frame0 = NASA_VIDEO.get_frame_data_by_index(
0, stream_index=stream_index
)
assert_frames_equal(frame0, reference_frame0.to(device))

frame6, _, _ = get_frame_at_pts(decoder, 6.006)
reference_frame6 = NASA_VIDEO.get_frame_data_by_index(
INDEX_OF_FRAME_AT_6_SECONDS, stream_index=stream_index
)
assert_frames_equal(frame6, reference_frame6.to(device))

frame6, _, _ = get_frame_at_index(decoder, frame_index=180)
reference_frame6 = NASA_VIDEO.get_frame_data_by_index(
INDEX_OF_FRAME_AT_6_SECONDS, stream_index=stream_index
)
assert_frames_equal(frame6, reference_frame6.to(device))

ref_frames0_9 = NASA_VIDEO.get_frame_data_by_range(0, 9)
bulk_frames0_9, *_ = get_frames_in_range(decoder, start=0, stop=9)
assert_frames_equal(bulk_frames0_9, ref_frames0_9.to(device))

@pytest.mark.parametrize("color_conversion_library", ("filtergraph", "swscale"))
def test_color_conversion_library(self, color_conversion_library):
decoder = create_from_file(str(NASA_VIDEO.path))