
Commit 1025e82

ahmadsharif1 authored and facebook-github-bot committed
[torchcodec] Improve benchmark to test all combinations
Summary: Test all combinations of CPU+GPU for decode+preproc.

Reviewed By: kaiyuey
Differential Revision: D60692261
fbshipit-source-id: 791d7530fd9a3b10ee614c3744663ae7c95c057c
1 parent 0595e3c
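The gist of the change, as a minimal illustrative sketch (not part of the commit): the benchmark now sweeps every decode device against every resize mode and labels each combination. The resize list below is the new --resize_devices default from the diff; the decode-device list is an assumed example, since the --devices default is not shown here.

import itertools

decode_devices = ["cuda:0", "cpu"]  # assumed example; the --devices default is not shown in this diff
resize_devices = ["cuda:0", "cpu", "native", "none"]  # new --resize_devices default

for decode, resize in itertools.product(decode_devices, resize_devices):
    # Labels mirror the benchmark output: D=decode device, R=resize mode.
    # "native" resizes inside the decode step; "none" skips resizing entirely.
    decode_label = "cuda" if "cuda" in decode else decode
    resize_label = "cuda" if "cuda" in resize else resize
    print(f"D={decode_label},R={resize_label}")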


benchmarks/decoders/gpu_benchmark.py

Lines changed: 79 additions & 41 deletions
@@ -8,31 +8,57 @@
 import torchcodec
 from torchvision.transforms import Resize
 
+RESIZED_WIDTH = 256
+RESIZED_HEIGHT = 256
 
-def transfer_and_resize_frame(frame):
+
+def transfer_and_resize_frame(frame, resize_device_string):
     # This should be a no-op if the frame is already on the GPU.
-    frame = frame.to("cuda:0")
-    frame = Resize((256, 256))(frame)
+    frame = frame.to(resize_device_string)
+    frame = Resize((RESIZED_HEIGHT, RESIZED_WIDTH))(frame)
     return frame
 
 
-def decode_full_video(video_path, device_string, do_gpu_preproc):
-    decoder = torchcodec.decoders.SimpleVideoDecoder(
-        video_path, device=torch.device(device_string)
+def decode_full_video(video_path, decode_device_string, resize_device_string):
+    # We use the core API instead of SimpleVideoDecoder because the core API
+    # allows us to natively resize as part of the decode step.
+    print(f"{decode_device_string=} {resize_device_string=}")
+    decoder = torchcodec.decoders._core.create_from_file(video_path)
+    num_threads = None
+    if "cuda" in decode_device_string:
+        num_threads = 1
+    width = None
+    height = None
+    if "native" in resize_device_string:
+        width = RESIZED_WIDTH
+        height = RESIZED_HEIGHT
+    torchcodec.decoders._core.add_video_stream(
+        decoder,
+        stream_index=-1,
+        device_string=decode_device_string,
+        num_threads=num_threads,
+        width=width,
+        height=height,
     )
+
     start_time = time.time()
     frame_count = 0
-    for frame in decoder:
-        # You can do a resize to simulate extra preproc work that happens
-        # on the GPU by uncommenting the following line:
-        if do_gpu_preproc:
-            frame = transfer_and_resize_frame(frame)
-        frame_count += 1
+    while True:
+        try:
+            frame, *_ = torchcodec.decoders._core.get_next_frame(decoder)
+            if resize_device_string != "none" and "native" not in resize_device_string:
+                frame = transfer_and_resize_frame(frame, resize_device_string)
+
+            frame_count += 1
+        except Exception as e:
+            print("EXCEPTION", e)
+            break
+
     end_time = time.time()
     elapsed = end_time - start_time
     fps = frame_count / (end_time - start_time)
     print(
-        f"****** DECODED full video {device_string=} {frame_count=} {elapsed=} {fps=}"
+        f"****** DECODED full video {decode_device_string=} {frame_count=} {elapsed=} {fps=}"
     )
     return frame_count, end_time - start_time

@@ -45,6 +71,12 @@ def main():
         type=str,
         help="Comma-separated devices to test decoding on.",
     )
+    parser.add_argument(
+        "--resize_devices",
+        default="cuda:0,cpu,native,none",
+        type=str,
+        help="Comma-separated devices to test preproc (resize) on. Use 'none' to specify no resize.",
+    )
     parser.add_argument(
         "--video",
         type=str,
@@ -60,15 +92,6 @@ def main():
             "to measure the cold start time."
         ),
     )
-    parser.add_argument(
-        "--do_gpu_preproc",
-        action=argparse.BooleanOptionalAction,
-        default=True,
-        help=(
-            "Do a transfer to GPU and resize operation after the decode to "
-            "simulate a real-world transform."
-        ),
-    )
     args = parser.parse_args()
     video_path = args.video
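Taken together, the two argument-parser hunks swap the boolean --do_gpu_preproc switch for a --resize_devices list. A minimal sketch of the resulting CLI surface, assuming the surrounding parser setup (the --devices default shown here is an assumption; only that flag's help text appears in the diff):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--devices",
    default="cuda:0,cpu",  # assumed default; the diff only shows this flag's help text
    type=str,
    help="Comma-separated devices to test decoding on.",
)
parser.add_argument(
    "--resize_devices",
    default="cuda:0,cpu,native,none",
    type=str,
    help="Comma-separated devices to test preproc (resize) on. Use 'none' to specify no resize.",
)
args = parser.parse_args()
print(args.devices.split(","), args.resize_devices.split(","))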

@@ -78,29 +101,44 @@ def main():
             decode_full_video(video_path, device)
         return
 
-    label = "Decode"
-    if args.do_gpu_preproc:
-        label += " + GPU Preproc"
-    label += " Time"
+    resize_devices = args.resize_devices.split(",")
+    resize_devices = [d for d in resize_devices if d != ""]
+    if len(resize_devices) == 0:
+        resize_devices.append("none")
+
+    label = "Decode+Resize Time"
 
     results = []
-    for device in args.devices.split(","):
-        print("device", device)
-        t = benchmark.Timer(
-            stmt="decode_full_video(video_path, device, do_gpu_preproc)",
-            globals={
-                "device": device,
-                "video_path": video_path,
-                "decode_full_video": decode_full_video,
-                "do_gpu_preproc": args.do_gpu_preproc,
-            },
-            label=label,
-            sub_label=f"video={os.path.basename(video_path)}",
-            description=f"decode_device={device}",
-        ).blocked_autorange()
-        results.append(t)
+    for decode_device_string in args.devices.split(","):
+        for resize_device_string in resize_devices:
+            decode_label = decode_device_string
+            if "cuda" in decode_label:
+                # Shorten "cuda:0" to "cuda"
+                decode_label = "cuda"
+            resize_label = resize_device_string
+            if "cuda" in resize_device_string:
+                # Shorten "cuda:0" to "cuda"
+                resize_label = "cuda"
+            print("decode_device", decode_device_string)
+            print("resize_device", resize_device_string)
+            t = benchmark.Timer(
+                stmt="decode_full_video(video_path, decode_device_string, resize_device_string)",
+                globals={
+                    "decode_device_string": decode_device_string,
+                    "video_path": video_path,
+                    "decode_full_video": decode_full_video,
+                    "resize_device_string": resize_device_string,
+                },
+                label=label,
+                sub_label=f"video={os.path.basename(video_path)}",
+                description=f"D={decode_label},R={resize_label}",
+            ).blocked_autorange()
+            results.append(t)
     compare = benchmark.Compare(results)
     compare.print()
+    print("Key: D=Decode, R=Resize")
+    print("Native resize is done as part of the decode step")
+    print("none resize means there is no resize step -- native or otherwise")
 
 
 if __name__ == "__main__":
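
For readers who prefer straight code over a diff, here is the post-commit decode path assembled from the added lines above. It is an illustrative sketch rather than a verbatim copy of the final file: the per-frame prints are dropped and the fps computation is condensed, and it assumes the file's existing imports.

import time

import torchcodec
from torchvision.transforms import Resize

RESIZED_WIDTH = 256
RESIZED_HEIGHT = 256


def transfer_and_resize_frame(frame, resize_device_string):
    # This should be a no-op if the frame is already on the resize device.
    frame = frame.to(resize_device_string)
    frame = Resize((RESIZED_HEIGHT, RESIZED_WIDTH))(frame)
    return frame


def decode_full_video(video_path, decode_device_string, resize_device_string):
    # The core API is used instead of SimpleVideoDecoder because it can resize
    # natively as part of the decode step (the "native" resize mode).
    decoder = torchcodec.decoders._core.create_from_file(video_path)
    num_threads = None
    if "cuda" in decode_device_string:
        num_threads = 1
    width = None
    height = None
    if "native" in resize_device_string:
        # Ask the decoder itself to produce resized frames.
        width = RESIZED_WIDTH
        height = RESIZED_HEIGHT
    torchcodec.decoders._core.add_video_stream(
        decoder,
        stream_index=-1,
        device_string=decode_device_string,
        num_threads=num_threads,
        width=width,
        height=height,
    )

    start_time = time.time()
    frame_count = 0
    while True:
        try:
            frame, *_ = torchcodec.decoders._core.get_next_frame(decoder)
            # "none" skips resizing; "native" was already handled by the decoder.
            if resize_device_string != "none" and "native" not in resize_device_string:
                frame = transfer_and_resize_frame(frame, resize_device_string)
            frame_count += 1
        except Exception:
            # As in the diff, an exception from get_next_frame signals end of stream.
            break

    elapsed = time.time() - start_time
    return frame_count, elapsed

main() then times each (decode device, resize mode) pair with benchmark.Timer and prints a benchmark.Compare table whose descriptions look like D=cuda,R=native (presumably torch.utils.benchmark, given the Timer/Compare/blocked_autorange API used in the last hunk).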
