
Adapt NPP calls for CUDA >= 12.9 #757

Open · wants to merge 4 commits into base: main
4 changes: 3 additions & 1 deletion .github/workflows/linux_cuda_wheel.yaml
@@ -67,7 +67,9 @@ jobs:
  # For the actual release we should add that label and change this to
  # include more python versions.
  python-version: ['3.9']
- cuda-version: ['12.6', '12.8']
+ # We test against 12.6 and 12.9 to avoid having too big of a CI matrix,
+ # but for releases we should add 12.8.
+ cuda-version: ['12.6', '12.9']
  # TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325
  ffmpeg-version-for-tests: ['4.4.2', '6', '7']

57 changes: 41 additions & 16 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -224,41 +224,66 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
// Use the user-requested GPU for running the NPP kernel.
c10::cuda::CUDAGuard deviceGuard(device_);

cudaStream_t rawStream = at::cuda::getCurrentCUDAStream().stream();

// Build an NppStreamContext, either via the old helper or by hand on
// CUDA 12.9+
NppStreamContext nppCtx{};
#if CUDA_VERSION < 12090
NppStatus ctxStat = nppGetStreamContext(&nppCtx);
TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed");
// override if you want to force a particular stream
nppCtx.hStream = rawStream;
#else
// CUDA 12.9+: helper was removed, we need to build it manually
int dev = 0;
cudaError_t err = cudaGetDevice(&dev);
Comment on lines +239 to +240 (Member):
Hi @Kh4L, I have some questions before moving forward:

  1. Should we just rely on the existing device_ attribute instead of calling cudaGetDevice(&dev), or are they actually equivalent?
  2. Would it make sense to cache the nppCtx across calls? In this PR it looks like we're creating the context over and over for every single frame that needs to be decoded. I wonder if it might be beneficial to cache it in the class and re-use it?

Thanks for your help so far, I'm still trying to build familiarity with that part of the code base.
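For context on question 2, a lazy per-device cache could look roughly like the sketch below. `CtxCache`, `StreamCtx`, and `initCount` are hypothetical stand-ins introduced for illustration only — the real `NppStreamContext` and its `cudaGetDeviceProperties`-derived fields are not assumed available here:

```cpp
#include <optional>

// Stand-in for NppStreamContext: only the fields needed to show the idea.
struct StreamCtx {
  int deviceId = -1;
  void* stream = nullptr;  // real code: cudaStream_t
};

// Hypothetical cache: the expensive device-property lookup happens once per
// device, while the (cheap) stream handle is rebound on every call.
class CtxCache {
 public:
  StreamCtx& get(int deviceId, void* stream) {
    if (!ctx_ || ctx_->deviceId != deviceId) {
      StreamCtx fresh;
      fresh.deviceId = deviceId;  // real code: cudaGetDeviceProperties(...)
      ctx_ = fresh;
      ++initCount;  // exposed only so the sketch is easy to check
    }
    ctx_->stream = stream;  // streams can change per call; rebinding is cheap
    return *ctx_;
  }
  int initCount = 0;

 private:
  std::optional<StreamCtx> ctx_;  // empty until the first frame is decoded
};
```

The design point is that the property query is per-device state while the stream is per-call state, so only the former needs caching.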

TORCH_CHECK(err == cudaSuccess, "cudaGetDevice failed");
cudaDeviceProp prop{};
err = cudaGetDeviceProperties(&prop, dev);
TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed");

nppCtx.nCudaDeviceId = dev;
nppCtx.nMultiProcessorCount = prop.multiProcessorCount;
nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
nppCtx.nStreamFlags = 0;
nppCtx.hStream = rawStream;
#endif

// Prepare ROI + pointers
NppiSize oSizeROI = {width, height};
Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};

auto start = std::chrono::high_resolution_clock::now();
NppStatus status;

if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
- status = nppiNV12ToRGB_709CSC_8u_P2C3R(
+ status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(
      input,
      avFrame->linesize[0],
      static_cast<Npp8u*>(dst.data_ptr()),
      dst.stride(0),
-     oSizeROI);
+     oSizeROI,
+     nppCtx);
} else {
- status = nppiNV12ToRGB_8u_P2C3R(
+ status = nppiNV12ToRGB_8u_P2C3R_Ctx(
      input,
      avFrame->linesize[0],
      static_cast<Npp8u*>(dst.data_ptr()),
      dst.stride(0),
-     oSizeROI);
+     oSizeROI,
+     nppCtx);
}
TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");

// Make the pytorch stream wait for the npp kernel to finish before using the
// output.
at::cuda::CUDAEvent nppDoneEvent;
at::cuda::CUDAStream nppStreamWrapper =
c10::cuda::getStreamFromExternal(nppGetStream(), device_.index());
nppDoneEvent.record(nppStreamWrapper);
nppDoneEvent.block(at::cuda::getCurrentCUDAStream());
Member:

@Kh4L Can you confirm my understanding that the nppiNV12ToRGB_8u_P2C3R_Ctx call will properly wait on the stream before returning?

Author:

Thank you!

> Do I understand correctly that in 12.9 we have to use the context-based API, while at the same time the context-creation helper was removed?! This sounds error-prone; is there any way we could avoid manually building and setting the context attributes?

Unfortunately, there is currently no alternative way to build this. Since I'm not part of the NPP team, I can't comment on their design choices.

> @Kh4L Can you confirm my understanding that the nppiNV12ToRGB_8u_P2C3R_Ctx call will properly wait on the stream before returning?

That's correct. We bind the NPP context to the active CUDA stream so we can leverage CUDA stream management rather than performing a blocking sync.

https://github.com/pytorch/torchcodec/pull/757/files#diff-37d8a09669d3f009b6850f6e66888b6875d805064933148fce3a637cc7694712R254
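The ordering behind that event-based pattern (record on the producer stream, then make the consumer stream wait) can be sketched with stand-in types. `Stream`, `Event`, and `syncOrder` below are illustrative only — real code uses `at::cuda::CUDAEvent`, `c10::cuda::getStreamFromExternal`, and `at::cuda::getCurrentCUDAStream`, none of which are assumed available here:

```cpp
#include <string>
#include <vector>

// Stand-ins for CUDA streams and events, logging actions so the ordering is
// visible without a GPU.
struct Stream {
  std::string name;
  std::vector<std::string>* log;
};

struct Event {
  // record() marks a point on the producer stream (here: after the NPP kernel).
  void record(Stream& s) { s.log->push_back("record:" + s.name); }
  // block() makes the consumer stream wait for that point without a host-side
  // blocking sync (the cudaStreamWaitEvent idea).
  void block(Stream& s) { s.log->push_back("block:" + s.name); }
};

// Mirrors the sequence in the diff: record on the NPP stream, then make the
// current PyTorch stream wait before it consumes the converted frame.
inline std::vector<std::string> syncOrder() {
  std::vector<std::string> log;
  Stream npp{"npp", &log};
  Stream torchStream{"torch", &log};
  Event done;
  done.record(npp);
  done.block(torchStream);
  return log;
}
```

With the context-based API bound to the current stream, this cross-stream handshake is no longer needed, which is the point the author makes above.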


auto end = std::chrono::high_resolution_clock::now();

- std::chrono::duration<double, std::micro> duration = end - start;
- VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width
-         << " took: " << duration.count() << "us" << std::endl;
+ auto duration = std::chrono::duration<double, std::micro>(end - start);
+ VLOG(9) << "NPP Conversion of frame h=" << height << " w=" << width
+         << " took: " << duration.count() << "us";
}

// inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9