Skip to content

Commit 0246410

Browse files
committed
BT.601, test_nvenc_against_ffmpeg_cli
1 parent a3d520b commit 0246410

15 files changed

+398
-153
lines changed

src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -833,6 +833,16 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
833833
gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
834834
}
835835

836+
UniqueAVFrame BetaCudaDeviceInterface::convertTensorToAVFrame(
837+
[[maybe_unused]] const torch::Tensor& tensor,
838+
[[maybe_unused]] AVPixelFormat targetFormat,
839+
[[maybe_unused]] int frameIndex,
840+
[[maybe_unused]] AVCodecContext* codecContext) {
841+
TORCH_CHECK(
842+
false,
843+
"Beta CUDA device interface does not support video encoding currently.");
844+
}
845+
836846
std::string BetaCudaDeviceInterface::getDetails() {
837847
std::string details = "Beta CUDA Device Interface.";
838848
if (cpuFallback_) {

src/torchcodec/_core/BetaCudaDeviceInterface.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ class BetaCudaDeviceInterface : public DeviceInterface {
4848
FrameOutput& frameOutput,
4949
std::optional<torch::Tensor> preAllocatedOutputTensor) override;
5050

51+
UniqueAVFrame convertTensorToAVFrame(
52+
const torch::Tensor& tensor,
53+
AVPixelFormat targetFormat,
54+
int frameIndex,
55+
AVCodecContext* codecContext) override;
56+
5157
int sendPacket(ReferenceAVPacket& packet) override;
5258
int sendEOFPacket() override;
5359
int receiveFrame(UniqueAVFrame& avFrame) override;

src/torchcodec/_core/CUDACommon.cpp

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,21 @@ const Npp32f bt709FullRangeColorTwist[3][4] = {
156156
{1.0f, -0.187324273f, -0.468124273f, -128.0f},
157157
{1.0f, 1.8556f, 0.0f, -128.0f}};
158158

159+
// RGB to NV12 color conversion matrices (inverse of YUV to RGB)
160+
// Note: NPP's ColorTwist function apparently expects "limited range"
161+
// coefficient format even when producing full range output. All matrices below
162+
// use the limited range coefficient format (Y with +16 offset) for NPP
163+
// compatibility.
164+
165+
// BT.601 limited range (matches FFmpeg default behavior)
166+
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
167+
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
168+
{0.257f, 0.504f, 0.098f, 16.0f},
169+
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
170+
{-0.148f, -0.291f, 0.439f, 128.0f},
171+
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
172+
{0.439f, -0.368f, -0.071f, 128.0f}};
173+
159174
torch::Tensor convertNV12FrameToRGB(
160175
UniqueAVFrame& avFrame,
161176
const torch::Device& device,
@@ -246,6 +261,68 @@ torch::Tensor convertNV12FrameToRGB(
246261
return dst;
247262
}
248263

264+
void convertRGBTensorToNV12Frame(
265+
const torch::Tensor& rgbTensor,
266+
UniqueAVFrame& nv12Frame,
267+
const torch::Device& device,
268+
const UniqueNppContext& nppCtx,
269+
at::cuda::CUDAStream inputStream) {
270+
TORCH_CHECK(rgbTensor.is_cuda(), "RGB tensor must be on CUDA device");
271+
TORCH_CHECK(
272+
rgbTensor.dim() == 3 && rgbTensor.size(0) == 3,
273+
"Expected 3D RGB tensor in CHW format, got shape: ",
274+
rgbTensor.sizes());
275+
TORCH_CHECK(
276+
nv12Frame != nullptr && nv12Frame->data[0] != nullptr,
277+
"nv12Frame must be pre-allocated with CUDA memory");
278+
279+
// Convert CHW to HWC for NPP processing
280+
int height = static_cast<int>(rgbTensor.size(1));
281+
int width = static_cast<int>(rgbTensor.size(2));
282+
torch::Tensor hwcFrame = rgbTensor.permute({1, 2, 0}).contiguous();
283+
284+
// Set up stream synchronization - make NPP stream wait for input tensor
285+
// operations
286+
at::cuda::CUDAStream nppStream =
287+
at::cuda::getCurrentCUDAStream(device.index());
288+
at::cuda::CUDAEvent inputDoneEvent;
289+
inputDoneEvent.record(inputStream);
290+
inputDoneEvent.block(nppStream);
291+
292+
// Setup NPP context
293+
nppCtx->hStream = nppStream.stream();
294+
cudaError_t cudaErr =
295+
cudaStreamGetFlags(nppCtx->hStream, &nppCtx->nStreamFlags);
296+
TORCH_CHECK(
297+
cudaErr == cudaSuccess,
298+
"cudaStreamGetFlags failed: ",
299+
cudaGetErrorString(cudaErr));
300+
301+
// Always use FFmpeg's default behavior: BT.601 limited range
302+
NppiSize oSizeROI = {width, height};
303+
304+
NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
305+
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
306+
hwcFrame.stride(0) * hwcFrame.element_size(),
307+
nv12Frame->data,
308+
nv12Frame->linesize,
309+
oSizeROI,
310+
defaultLimitedRangeRgbToNv12,
311+
*nppCtx);
312+
313+
TORCH_CHECK(
314+
status == NPP_SUCCESS,
315+
"Failed to convert RGB to NV12: NPP error code ",
316+
status);
317+
318+
// Validate CUDA operations completed successfully
319+
cudaError_t memCheck = cudaGetLastError();
320+
TORCH_CHECK(
321+
memCheck == cudaSuccess,
322+
"CUDA error detected: ",
323+
cudaGetErrorString(memCheck));
324+
}
325+
249326
UniqueNppContext getNppStreamContext(const torch::Device& device) {
250327
int deviceIndex = getDeviceIndex(device);
251328

src/torchcodec/_core/CUDACommon.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,13 @@ torch::Tensor convertNV12FrameToRGB(
3737
at::cuda::CUDAStream nvdecStream,
3838
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
3939

40+
void convertRGBTensorToNV12Frame(
41+
const torch::Tensor& rgbTensor,
42+
UniqueAVFrame& nv12Frame,
43+
const torch::Device& device,
44+
const UniqueNppContext& nppCtx,
45+
at::cuda::CUDAStream inputStream);
46+
4047
UniqueNppContext getNppStreamContext(const torch::Device& device);
4148
void returnNppStreamContextToCache(
4249
const torch::Device& device,

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,84 @@ std::optional<torch::Tensor> CpuDeviceInterface::maybeFlushAudioBuffers() {
429429
/*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples);
430430
}
431431

432+
UniqueAVFrame CpuDeviceInterface::convertTensorToAVFrame(
433+
const torch::Tensor& frame,
434+
AVPixelFormat outPixelFormat,
435+
int frameIndex,
436+
[[maybe_unused]] AVCodecContext* codecContext) {
437+
int inHeight = static_cast<int>(frame.sizes()[1]);
438+
int inWidth = static_cast<int>(frame.sizes()[2]);
439+
440+
// For now, reuse input dimensions as output dimensions
441+
int outWidth = inWidth;
442+
int outHeight = inHeight;
443+
444+
// Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering)
445+
AVPixelFormat inPixelFormat = AV_PIX_FMT_GBRP;
446+
447+
// Initialize and cache scaling context if it does not exist
448+
if (!swsContext_) {
449+
swsContext_.reset(sws_getContext(
450+
inWidth,
451+
inHeight,
452+
inPixelFormat,
453+
outWidth,
454+
outHeight,
455+
outPixelFormat,
456+
SWS_BICUBIC, // Used by FFmpeg CLI
457+
nullptr,
458+
nullptr,
459+
nullptr));
460+
TORCH_CHECK(swsContext_ != nullptr, "Failed to create scaling context");
461+
}
462+
463+
UniqueAVFrame avFrame(av_frame_alloc());
464+
TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
465+
466+
// Set output frame properties
467+
avFrame->format = outPixelFormat;
468+
avFrame->width = outWidth;
469+
avFrame->height = outHeight;
470+
avFrame->pts = frameIndex;
471+
472+
int status = av_frame_get_buffer(avFrame.get(), 0);
473+
TORCH_CHECK(status >= 0, "Failed to allocate frame buffer");
474+
475+
// Need to convert/scale the frame
476+
// Create temporary frame with input format
477+
UniqueAVFrame inputFrame(av_frame_alloc());
478+
TORCH_CHECK(inputFrame != nullptr, "Failed to allocate input AVFrame");
479+
480+
inputFrame->format = inPixelFormat;
481+
inputFrame->width = inWidth;
482+
inputFrame->height = inHeight;
483+
484+
uint8_t* tensorData = static_cast<uint8_t*>(frame.data_ptr());
485+
486+
// TODO-VideoEncoder: Reorder tensor if in NHWC format
487+
int channelSize = inHeight * inWidth;
488+
// Reorder RGB -> GBR for AV_PIX_FMT_GBRP format
489+
// TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format
490+
inputFrame->data[0] = tensorData + channelSize;
491+
inputFrame->data[1] = tensorData + (2 * channelSize);
492+
inputFrame->data[2] = tensorData;
493+
494+
inputFrame->linesize[0] = inWidth;
495+
inputFrame->linesize[1] = inWidth;
496+
inputFrame->linesize[2] = inWidth;
497+
498+
status = sws_scale(
499+
swsContext_.get(),
500+
inputFrame->data,
501+
inputFrame->linesize,
502+
0,
503+
inputFrame->height,
504+
avFrame->data,
505+
avFrame->linesize);
506+
TORCH_CHECK(status == outHeight, "sws_scale failed");
507+
return avFrame;
508+
}
509+
432510
std::string CpuDeviceInterface::getDetails() {
433511
return std::string("CPU Device Interface.");
434512
}

src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ class CpuDeviceInterface : public DeviceInterface {
3838
FrameOutput& frameOutput,
3939
std::optional<torch::Tensor> preAllocatedOutputTensor) override;
4040

41+
UniqueAVFrame convertTensorToAVFrame(
42+
const torch::Tensor& tensor,
43+
AVPixelFormat targetFormat,
44+
int frameIndex,
45+
AVCodecContext* codecContext) override;
46+
4147
std::string getDetails() override;
4248

4349
private:

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
#include <ATen/cuda/CUDAEvent.h>
22
#include <c10/cuda/CUDAStream.h>
3+
#include <cuda_runtime.h>
34
#include <torch/types.h>
45
#include <mutex>
56

7+
#include "CUDACommon.h"
68
#include "Cache.h"
79
#include "CudaDeviceInterface.h"
810
#include "FFMPEGCommon.h"
@@ -142,6 +144,34 @@ void CudaDeviceInterface::registerHardwareDeviceWithCodec(
142144
hardwareDeviceCtx_, "Hardware device context has not been initialized");
143145
TORCH_CHECK(codecContext != nullptr, "codecContext is null");
144146
codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
147+
// is there any way to preserve actual desired format?
148+
// codecContext->sw_pix_fmt = codecContext->pix_fmt;
149+
// Should we always produce AV_PIX_FMT_NV12?
150+
codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
151+
codecContext->pix_fmt = AV_PIX_FMT_CUDA;
152+
153+
AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
154+
TORCH_CHECK(
155+
hwFramesCtxRef != nullptr,
156+
"Failed to allocate hardware frames context for codec");
157+
158+
AVHWFramesContext* hwFramesCtx =
159+
reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
160+
hwFramesCtx->format = codecContext->pix_fmt;
161+
hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
162+
hwFramesCtx->width = codecContext->width;
163+
hwFramesCtx->height = codecContext->height;
164+
165+
int ret = av_hwframe_ctx_init(hwFramesCtxRef);
166+
if (ret < 0) {
167+
av_buffer_unref(&hwFramesCtxRef);
168+
TORCH_CHECK(
169+
false,
170+
"Failed to initialize CUDA frames context for codec: ",
171+
getFFMPEGErrorStringFromErrorCode(ret));
172+
}
173+
174+
codecContext->hw_frames_ctx = hwFramesCtxRef;
145175
}
146176

147177
UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24(
@@ -379,6 +409,44 @@ std::optional<const AVCodec*> CudaDeviceInterface::findDecoder(
379409
return std::nullopt;
380410
}
381411

412+
UniqueAVFrame CudaDeviceInterface::convertTensorToAVFrame(
413+
const torch::Tensor& frame,
414+
[[maybe_unused]] AVPixelFormat targetFormat,
415+
int frameIndex,
416+
AVCodecContext* codecContext) {
417+
TORCH_CHECK(frame.is_cuda(), "CUDA device interface requires CUDA tensors");
418+
TORCH_CHECK(
419+
frame.dim() == 3 && frame.size(0) == 3,
420+
"Expected 3D RGB tensor (CHW format), got shape: ",
421+
frame.sizes());
422+
423+
UniqueAVFrame avFrame(av_frame_alloc());
424+
TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
425+
426+
avFrame->format = AV_PIX_FMT_CUDA;
427+
avFrame->width = static_cast<int>(frame.size(2));
428+
avFrame->height = static_cast<int>(frame.size(1));
429+
avFrame->pts = frameIndex;
430+
431+
int ret = av_hwframe_get_buffer(
432+
codecContext ? codecContext->hw_frames_ctx : nullptr, avFrame.get(), 0);
433+
TORCH_CHECK(
434+
ret >= 0,
435+
"Failed to allocate hardware frame: ",
436+
getFFMPEGErrorStringFromErrorCode(ret));
437+
438+
at::cuda::CUDAStream currentStream =
439+
at::cuda::getCurrentCUDAStream(device_.index());
440+
441+
convertRGBTensorToNV12Frame(frame, avFrame, device_, nppCtx_, currentStream);
442+
443+
// Set color properties to FFmpeg defaults
444+
avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
445+
avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
446+
447+
return avFrame;
448+
}
449+
382450
std::string CudaDeviceInterface::getDetails() {
383451
// Note: for this interface specifically the fallback is only known after a
384452
// frame has been decoded, not before: that's when FFmpeg decides to fallback,

src/torchcodec/_core/CudaDeviceInterface.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ class CudaDeviceInterface : public DeviceInterface {
4040
FrameOutput& frameOutput,
4141
std::optional<torch::Tensor> preAllocatedOutputTensor) override;
4242

43+
UniqueAVFrame convertTensorToAVFrame(
44+
const torch::Tensor& tensor,
45+
AVPixelFormat targetFormat,
46+
int frameIndex,
47+
AVCodecContext* codecContext) override;
48+
4349
std::string getDetails() override;
4450

4551
private:

src/torchcodec/_core/DeviceInterface.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,14 @@ class DeviceInterface {
9797
FrameOutput& frameOutput,
9898
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt) = 0;
9999

100+
// Convert tensor to AVFrame, implemented per device interface.
101+
// This is similar to convertAVFrameToFrameOutput for encoding
102+
virtual UniqueAVFrame convertTensorToAVFrame(
103+
const torch::Tensor& tensor,
104+
AVPixelFormat targetFormat,
105+
int frameIndex,
106+
AVCodecContext* codecContext) = 0;
107+
100108
// ------------------------------------------
101109
// Extension points for custom decoding paths
102110
// ------------------------------------------

0 commit comments

Comments
 (0)