meta-pytorch
diff --git a/‎src/torchcodec/_core/BetaCudaDeviceInterface.cpp‎
Lines changed: 10 additions & 0 deletions b/‎src/torchcodec/_core/BetaCudaDeviceInterface.cpp‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/BetaCudaDeviceInterface.h‎
Lines changed: 6 additions & 0 deletions b/‎src/torchcodec/_core/BetaCudaDeviceInterface.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/CUDACommon.cpp‎
Lines changed: 77 additions & 0 deletions b/‎src/torchcodec/_core/CUDACommon.cpp‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/CUDACommon.h‎
Lines changed: 7 additions & 0 deletions b/‎src/torchcodec/_core/CUDACommon.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/CpuDeviceInterface.cpp‎
Lines changed: 78 additions & 0 deletions b/‎src/torchcodec/_core/CpuDeviceInterface.cpp‎
Lines changed: 78 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/CpuDeviceInterface.h‎
Lines changed: 6 additions & 0 deletions b/‎src/torchcodec/_core/CpuDeviceInterface.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/CudaDeviceInterface.cpp‎
Lines changed: 68 additions & 0 deletions b/‎src/torchcodec/_core/CudaDeviceInterface.cpp‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/CudaDeviceInterface.h‎
Lines changed: 6 additions & 0 deletions b/‎src/torchcodec/_core/CudaDeviceInterface.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/DeviceInterface.h‎
Lines changed: 8 additions & 0 deletions b/‎src/torchcodec/_core/DeviceInterface.h‎
Lines changed: 8 additions & 0 deletions
@@ -833,6 +833,16 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
       gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
 }
 
+// Encoding entry point required by DeviceInterface. The Beta CUDA interface
+// has no encoding path: every call fails the TORCH_CHECK below and none of
+// the arguments are read.
+UniqueAVFrame BetaCudaDeviceInterface::convertTensorToAVFrame(
+    const torch::Tensor& /*tensor*/,
+    AVPixelFormat /*targetFormat*/,
+    int /*frameIndex*/,
+    AVCodecContext* /*codecContext*/) {
+  TORCH_CHECK(
+      false,
+      "Beta CUDA device interface does not support video encoding currently.");
+}
+
 std::string BetaCudaDeviceInterface::getDetails() {
   std::string details = "Beta CUDA Device Interface.";
   if (cpuFallback_) {
 
@@ -48,6 +48,12 @@ class BetaCudaDeviceInterface : public DeviceInterface {
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor) override;
 
+  // Encoding hook required by DeviceInterface. The definition in
+  // BetaCudaDeviceInterface.cpp unconditionally fails a TORCH_CHECK: this
+  // interface does not support video encoding.
+  UniqueAVFrame convertTensorToAVFrame(
+      const torch::Tensor& tensor,
+      AVPixelFormat targetFormat,
+      int frameIndex,
+      AVCodecContext* codecContext) override;
+
   int sendPacket(ReferenceAVPacket& packet) override;
   int sendEOFPacket() override;
   int receiveFrame(UniqueAVFrame& avFrame) override;
 
@@ -156,6 +156,21 @@ const Npp32f bt709FullRangeColorTwist[3][4] = {
     {1.0f, -0.187324273f, -0.468124273f, -128.0f},
     {1.0f, 1.8556f, 0.0f, -128.0f}};
 
+// RGB to NV12 color conversion matrices (inverse of YUV to RGB)
+// Note: NPP's ColorTwist function apparently expects "limited range"
+// coefficient format even when producing full range output. All matrices below
+// use the limited range coefficient format (Y with +16 offset) for NPP
+// compatibility.
+
+// BT.601 limited range (matches FFmpeg default behavior).
+// One row per output component (Y, U, V). Within a row the first three
+// entries are the weights applied to R, G and B, and the fourth entry is the
+// constant offset written in the per-row formulas below. This table is the
+// color twist argument passed to nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx in
+// convertRGBTensorToNV12Frame.
+const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
+    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
+    {0.257f, 0.504f, 0.098f, 16.0f},
+    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
+    {-0.148f, -0.291f, 0.439f, 128.0f},
+    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
+    {0.439f, -0.368f, -0.071f, 128.0f}};
+
 torch::Tensor convertNV12FrameToRGB(
     UniqueAVFrame& avFrame,
     const torch::Device& device,
@@ -246,6 +261,68 @@ torch::Tensor convertNV12FrameToRGB(
   return dst;
 }
 
+// Converts a CHW uint8 RGB CUDA tensor into an already-allocated NV12 frame
+// using NPP, with the BT.601 limited-range matrix
+// (defaultLimitedRangeRgbToNv12).
+//
+// Args:
+//   rgbTensor: 3D CUDA tensor of shape (3, H, W) and dtype uint8.
+//   nv12Frame: destination frame. Both NV12 planes (data[0] = Y,
+//       data[1] = interleaved UV) must already point to device memory, and
+//       the frame's width/height must equal the tensor's W/H.
+//   device: CUDA device whose current stream is used to run the NPP kernel.
+//   nppCtx: NPP stream context; its hStream/nStreamFlags are overwritten here.
+//   inputStream: stream on which rgbTensor was produced. The NPP stream is
+//       made to wait on it before the conversion is enqueued.
+//
+// Fails with a TORCH_CHECK error on invalid arguments or on an NPP/CUDA error.
+// The conversion is only enqueued: this function does not synchronize.
+void convertRGBTensorToNV12Frame(
+    const torch::Tensor& rgbTensor,
+    UniqueAVFrame& nv12Frame,
+    const torch::Device& device,
+    const UniqueNppContext& nppCtx,
+    at::cuda::CUDAStream inputStream) {
+  TORCH_CHECK(rgbTensor.is_cuda(), "RGB tensor must be on CUDA device");
+  TORCH_CHECK(
+      rgbTensor.dim() == 3 && rgbTensor.size(0) == 3,
+      "Expected 3D RGB tensor in CHW format, got shape: ",
+      rgbTensor.sizes());
+  // The buffer is handed to NPP as raw Npp8u bytes, so any other dtype would
+  // be silently misinterpreted.
+  TORCH_CHECK(
+      rgbTensor.scalar_type() == torch::kUInt8,
+      "Expected uint8 RGB tensor, got dtype: ",
+      rgbTensor.scalar_type());
+  // NV12 has two planes and NPP writes to both of them.
+  TORCH_CHECK(
+      nv12Frame != nullptr && nv12Frame->data[0] != nullptr &&
+          nv12Frame->data[1] != nullptr,
+      "nv12Frame must be pre-allocated with CUDA memory");
+  TORCH_CHECK(nppCtx != nullptr, "NPP stream context must be initialized");
+
+  const int height = static_cast<int>(rgbTensor.size(1));
+  const int width = static_cast<int>(rgbTensor.size(2));
+
+  // NPP writes a full width x height ROI into the destination planes. A frame
+  // of a different size would either be partially filled or written out of
+  // bounds.
+  TORCH_CHECK(
+      nv12Frame->width == width && nv12Frame->height == height,
+      "nv12Frame dimensions (",
+      nv12Frame->width,
+      "x",
+      nv12Frame->height,
+      ") must match RGB tensor dimensions (",
+      width,
+      "x",
+      height,
+      ")");
+
+  // Convert CHW to HWC for NPP processing
+  torch::Tensor hwcFrame = rgbTensor.permute({1, 2, 0}).contiguous();
+
+  // Set up stream synchronization - make NPP stream wait for input tensor
+  // operations
+  at::cuda::CUDAStream nppStream =
+      at::cuda::getCurrentCUDAStream(device.index());
+  at::cuda::CUDAEvent inputDoneEvent;
+  inputDoneEvent.record(inputStream);
+  inputDoneEvent.block(nppStream);
+
+  // Setup NPP context
+  nppCtx->hStream = nppStream.stream();
+  cudaError_t cudaErr =
+      cudaStreamGetFlags(nppCtx->hStream, &nppCtx->nStreamFlags);
+  TORCH_CHECK(
+      cudaErr == cudaSuccess,
+      "cudaStreamGetFlags failed: ",
+      cudaGetErrorString(cudaErr));
+
+  // Always use FFmpeg's default behavior: BT.601 limited range
+  NppiSize oSizeROI = {width, height};
+
+  // Row pitch of the packed HWC source, in bytes. NPP takes an int.
+  const int srcStep =
+      static_cast<int>(hwcFrame.stride(0) * hwcFrame.element_size());
+
+  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
+      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
+      srcStep,
+      nv12Frame->data,
+      nv12Frame->linesize,
+      oSizeROI,
+      defaultLimitedRangeRgbToNv12,
+      *nppCtx);
+
+  TORCH_CHECK(
+      status == NPP_SUCCESS,
+      "Failed to convert RGB to NV12: NPP error code ",
+      status);
+
+  // Surface any error the CUDA runtime has already recorded (e.g. a failed
+  // launch). This does not wait for the enqueued work to finish.
+  cudaError_t memCheck = cudaGetLastError();
+  TORCH_CHECK(
+      memCheck == cudaSuccess,
+      "CUDA error detected: ",
+      cudaGetErrorString(memCheck));
+}
+
 UniqueNppContext getNppStreamContext(const torch::Device& device) {
   int deviceIndex = getDeviceIndex(device);
 
 
@@ -37,6 +37,13 @@ torch::Tensor convertNV12FrameToRGB(
     at::cuda::CUDAStream nvdecStream,
     std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
 
+// Converts a CHW RGB CUDA tensor into the pre-allocated NV12 frame `nv12Frame`
+// using NPP and a BT.601 limited-range matrix. The NPP work runs on the
+// current CUDA stream of `device`, which is made to wait on `inputStream`
+// first. Fails with a TORCH_CHECK error on invalid input or NPP/CUDA failure.
+void convertRGBTensorToNV12Frame(
+    const torch::Tensor& rgbTensor,
+    UniqueAVFrame& nv12Frame,
+    const torch::Device& device,
+    const UniqueNppContext& nppCtx,
+    at::cuda::CUDAStream inputStream);
+
 UniqueNppContext getNppStreamContext(const torch::Device& device);
 void returnNppStreamContextToCache(
     const torch::Device& device,
 
@@ -429,6 +429,84 @@ std::optional<torch::Tensor> CpuDeviceInterface::maybeFlushAudioBuffers() {
       /*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples);
 }
 
+// Converts a CHW RGB uint8 CPU tensor into a newly allocated AVFrame in
+// `outPixelFormat`, using swscale.
+//
+// Args:
+//   frame: 3D CPU tensor of shape (3, H, W) and dtype uint8. Non-contiguous
+//       tensors are accepted; a contiguous copy is made for them.
+//   outPixelFormat: pixel format of the returned frame.
+//   frameIndex: stored as the returned frame's pts.
+//   codecContext: unused by the CPU implementation.
+//
+// Returns a frame with the same width and height as the input tensor. Fails
+// with a TORCH_CHECK error on invalid input or on an FFmpeg failure.
+UniqueAVFrame CpuDeviceInterface::convertTensorToAVFrame(
+    const torch::Tensor& frame,
+    AVPixelFormat outPixelFormat,
+    int frameIndex,
+    [[maybe_unused]] AVCodecContext* codecContext) {
+  // The raw pointer arithmetic below is only valid for host memory holding
+  // one byte per sample in CHW order, so validate that up front.
+  TORCH_CHECK(frame.is_cpu(), "CPU device interface requires CPU tensors");
+  TORCH_CHECK(
+      frame.dim() == 3 && frame.size(0) == 3,
+      "Expected 3D RGB tensor (CHW format), got shape: ",
+      frame.sizes());
+  TORCH_CHECK(
+      frame.scalar_type() == torch::kUInt8,
+      "Expected uint8 RGB tensor, got dtype: ",
+      frame.scalar_type());
+
+  // No-op (shares storage) when `frame` is already contiguous.
+  const torch::Tensor contiguousFrame = frame.contiguous();
+
+  const int inHeight = static_cast<int>(contiguousFrame.size(1));
+  const int inWidth = static_cast<int>(contiguousFrame.size(2));
+
+  // For now, reuse input dimensions as output dimensions
+  const int outWidth = inWidth;
+  const int outHeight = inHeight;
+
+  // Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering)
+  const AVPixelFormat inPixelFormat = AV_PIX_FMT_GBRP;
+
+  // Reuse the cached scaling context when its parameters still match and
+  // rebuild it otherwise. sws_getCachedContext takes over the pointer it is
+  // given (it frees it on mismatch), hence release() followed by reset().
+  swsContext_.reset(sws_getCachedContext(
+      swsContext_.release(),
+      inWidth,
+      inHeight,
+      inPixelFormat,
+      outWidth,
+      outHeight,
+      outPixelFormat,
+      SWS_BICUBIC, // Used by FFmpeg CLI
+      nullptr,
+      nullptr,
+      nullptr));
+  TORCH_CHECK(swsContext_ != nullptr, "Failed to create scaling context");
+
+  UniqueAVFrame avFrame(av_frame_alloc());
+  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
+
+  // Set output frame properties
+  avFrame->format = outPixelFormat;
+  avFrame->width = outWidth;
+  avFrame->height = outHeight;
+  avFrame->pts = frameIndex;
+
+  int status = av_frame_get_buffer(avFrame.get(), 0);
+  TORCH_CHECK(status >= 0, "Failed to allocate frame buffer");
+
+  const uint8_t* tensorData =
+      static_cast<const uint8_t*>(contiguousFrame.data_ptr());
+
+  // TODO-VideoEncoder: Reorder tensor if in NHWC format
+  // 64-bit so the plane offsets cannot overflow for very large frames.
+  const int64_t channelSize = static_cast<int64_t>(inHeight) * inWidth;
+
+  // Reorder RGB -> GBR for AV_PIX_FMT_GBRP format
+  // TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format
+  // swscale reads four plane slots, so the unused fourth one is zeroed.
+  const uint8_t* const srcPlanes[4] = {
+      tensorData + channelSize, // G
+      tensorData + (2 * channelSize), // B
+      tensorData, // R
+      nullptr};
+  const int srcLinesizes[4] = {inWidth, inWidth, inWidth, 0};
+
+  status = sws_scale(
+      swsContext_.get(),
+      srcPlanes,
+      srcLinesizes,
+      0,
+      inHeight,
+      avFrame->data,
+      avFrame->linesize);
+  TORCH_CHECK(status == outHeight, "sws_scale failed");
+  return avFrame;
+}
+
 std::string CpuDeviceInterface::getDetails() {
   return std::string("CPU Device Interface.");
 }
 
@@ -38,6 +38,12 @@ class CpuDeviceInterface : public DeviceInterface {
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor) override;
 
+  // Converts an RGB tensor into a newly allocated AVFrame in targetFormat
+  // using swscale. The returned frame's pts is set to frameIndex.
+  // codecContext is not used by the CPU implementation.
+  UniqueAVFrame convertTensorToAVFrame(
+      const torch::Tensor& tensor,
+      AVPixelFormat targetFormat,
+      int frameIndex,
+      AVCodecContext* codecContext) override;
+
   std::string getDetails() override;
 
  private:
 
@@ -1,8 +1,10 @@
 #include <ATen/cuda/CUDAEvent.h>
 #include <c10/cuda/CUDAStream.h>
+#include <cuda_runtime.h>
 #include <torch/types.h>
 #include <mutex>
 
+#include "CUDACommon.h"
 #include "Cache.h"
 #include "CudaDeviceInterface.h"
 #include "FFMPEGCommon.h"
@@ -142,6 +144,34 @@ void CudaDeviceInterface::registerHardwareDeviceWithCodec(
       hardwareDeviceCtx_, "Hardware device context has not been initialized");
   TORCH_CHECK(codecContext != nullptr, "codecContext is null");
   codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
+  // is there any way to preserve actual desired format?
+  // codecContext->sw_pix_fmt = codecContext->pix_fmt;
+  // Should we always produce AV_PIX_FMT_NV12?
+  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
+  codecContext->pix_fmt = AV_PIX_FMT_CUDA;
+
+  AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
+  TORCH_CHECK(
+      hwFramesCtxRef != nullptr,
+      "Failed to allocate hardware frames context for codec");
+
+  AVHWFramesContext* hwFramesCtx =
+      reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
+  hwFramesCtx->format = codecContext->pix_fmt;
+  hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
+  hwFramesCtx->width = codecContext->width;
+  hwFramesCtx->height = codecContext->height;
+
+  int ret = av_hwframe_ctx_init(hwFramesCtxRef);
+  if (ret < 0) {
+    av_buffer_unref(&hwFramesCtxRef);
+    TORCH_CHECK(
+        false,
+        "Failed to initialize CUDA frames context for codec: ",
+        getFFMPEGErrorStringFromErrorCode(ret));
+  }
+
+  codecContext->hw_frames_ctx = hwFramesCtxRef;
 }
 
 UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24(
@@ -379,6 +409,44 @@ std::optional<const AVCodec*> CudaDeviceInterface::findDecoder(
   return std::nullopt;
 }
 
+// Converts a CHW RGB CUDA tensor into a newly allocated AV_PIX_FMT_CUDA frame
+// holding NV12 data, ready to be sent to a hardware encoder.
+//
+// Args:
+//   frame: 3D CUDA tensor of shape (3, H, W).
+//   targetFormat: ignored; the output is always produced through the RGB to
+//       NV12 conversion.
+//   frameIndex: stored as the returned frame's pts.
+//   codecContext: must be non-null and carry an initialized hw_frames_ctx,
+//       from which the output surface is allocated.
+//
+// Fails with a TORCH_CHECK error on invalid input or on an FFmpeg, NPP or
+// CUDA failure.
+UniqueAVFrame CudaDeviceInterface::convertTensorToAVFrame(
+    const torch::Tensor& frame,
+    [[maybe_unused]] AVPixelFormat targetFormat,
+    int frameIndex,
+    AVCodecContext* codecContext) {
+  TORCH_CHECK(frame.is_cuda(), "CUDA device interface requires CUDA tensors");
+  TORCH_CHECK(
+      frame.dim() == 3 && frame.size(0) == 3,
+      "Expected 3D RGB tensor (CHW format), got shape: ",
+      frame.sizes());
+  // av_hwframe_get_buffer dereferences the frames context it is given, so a
+  // missing context must be rejected here rather than forwarded as nullptr.
+  TORCH_CHECK(
+      codecContext != nullptr && codecContext->hw_frames_ctx != nullptr,
+      "codecContext must have an initialized hardware frames context");
+
+  const int width = static_cast<int>(frame.size(2));
+  const int height = static_cast<int>(frame.size(1));
+
+  UniqueAVFrame avFrame(av_frame_alloc());
+  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
+
+  avFrame->format = AV_PIX_FMT_CUDA;
+  avFrame->width = width;
+  avFrame->height = height;
+  avFrame->pts = frameIndex;
+
+  int ret =
+      av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0);
+  TORCH_CHECK(
+      ret >= 0,
+      "Failed to allocate hardware frame: ",
+      getFFMPEGErrorStringFromErrorCode(ret));
+
+  // The surface comes from hw_frames_ctx, so its size is whatever that context
+  // was configured with. Make sure the frame we got back still matches the
+  // tensor before NPP writes width x height pixels into it.
+  TORCH_CHECK(
+      avFrame->width == width && avFrame->height == height,
+      "Allocated hardware frame dimensions (",
+      avFrame->width,
+      "x",
+      avFrame->height,
+      ") do not match tensor dimensions (",
+      width,
+      "x",
+      height,
+      ")");
+
+  at::cuda::CUDAStream currentStream =
+      at::cuda::getCurrentCUDAStream(device_.index());
+
+  convertRGBTensorToNV12Frame(frame, avFrame, device_, nppCtx_, currentStream);
+
+  // Set color properties to FFmpeg defaults
+  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
+  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
+
+  return avFrame;
+}
+
 std::string CudaDeviceInterface::getDetails() {
   // Note: for this interface specifically the fallback is only known after a
   // frame has been decoded, not before: that's when FFmpeg decides to fallback,
 
@@ -40,6 +40,12 @@ class CudaDeviceInterface : public DeviceInterface {
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor) override;
 
+  // Converts a CHW RGB CUDA tensor into a newly allocated AV_PIX_FMT_CUDA
+  // frame, allocated from codecContext->hw_frames_ctx and filled through an
+  // RGB to NV12 conversion. targetFormat is ignored by this implementation.
+  // The returned frame's pts is set to frameIndex.
+  UniqueAVFrame convertTensorToAVFrame(
+      const torch::Tensor& tensor,
+      AVPixelFormat targetFormat,
+      int frameIndex,
+      AVCodecContext* codecContext) override;
+
   std::string getDetails() override;
 
  private:
 
@@ -97,6 +97,14 @@ class DeviceInterface {
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt) = 0;
 
+  // Converts a tensor to an AVFrame for encoding. Implemented by each device
+  // interface; this is the encoding counterpart of
+  // convertAVFrameToFrameOutput.
+  //   tensor:       the frame to convert.
+  //   targetFormat: requested pixel format of the returned frame.
+  //   frameIndex:   index of the frame; the CPU and CUDA implementations
+  //                 store it as the frame's pts.
+  //   codecContext: codec context of the encoder; whether it is used depends
+  //                 on the implementation.
+  virtual UniqueAVFrame convertTensorToAVFrame(
+      const torch::Tensor& tensor,
+      AVPixelFormat targetFormat,
+      int frameIndex,
+      AVCodecContext* codecContext) = 0;
+
   // ------------------------------------------
   // Extension points for custom decoding paths
   // ------------------------------------------