Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/lerobot/datasets/image_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def image_array_to_pil_image(image_array: np.ndarray, range_check: bool = True)
return PIL.Image.fromarray(image_array)


def write_image(image: np.ndarray | PIL.Image.Image, fpath: Path, compress_level: int = 1):
def write_image(image: np.ndarray | PIL.Image.Image, fpath: Path, compress_level: int = 0):
"""
Saves a NumPy array or PIL Image to a file.

Expand Down
130 changes: 87 additions & 43 deletions src/lerobot/datasets/video_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import importlib
import logging
import shutil
import subprocess
import tempfile
import warnings
from dataclasses import dataclass, field
Expand All @@ -30,7 +31,6 @@
import torch
import torchvision
from datasets.features.features import register_feature
from PIL import Image


def get_safe_default_codec():
Expand Down Expand Up @@ -312,7 +312,10 @@ def encode_video_frames(
log_level: int | None = av.logging.ERROR,
overwrite: bool = False,
) -> None:
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`.

This implementation uses direct ffmpeg commands via subprocess instead of pyav bindings.
"""
# Check encoder availability
if vcodec not in ["h264", "hevc", "libsvtav1"]:
raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.")
Expand All @@ -339,55 +342,96 @@ def encode_video_frames(
glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("-")[-1].split(".")[0])
)

# Define video output frame size (assuming all input frames are the same size)
# Check if input frames exist
if len(input_list) == 0:
raise FileNotFoundError(f"No images found in {imgs_dir}.")
with Image.open(input_list[0]) as dummy_image:
width, height = dummy_image.size

# Define video codec options
video_options = {}
# Build ffmpeg command
# Input pattern for sequential frames
input_pattern = str(imgs_dir / "frame-%06d.png")

if g is not None:
video_options["g"] = str(g)
# Start building the command
cmd = ["ffmpeg"]

if crf is not None:
video_options["crf"] = str(crf)
# Set log level for ffmpeg
ffmpeg_log_level = "quiet"
if log_level is not None:
# Map pyav log levels to ffmpeg log levels
log_level_map = {
av.logging.PANIC: "panic",
av.logging.FATAL: "fatal",
av.logging.ERROR: "error",
av.logging.WARNING: "warning",
av.logging.INFO: "info",
av.logging.VERBOSE: "verbose",
av.logging.DEBUG: "debug",
}
ffmpeg_log_level = log_level_map.get(log_level, "error")
cmd.extend(["-loglevel", ffmpeg_log_level])

# Overwrite output file if needed
if overwrite:
cmd.append("-y")
else:
cmd.append("-n")

# Input options
cmd.extend(
[
"-framerate",
str(fps),
"-i",
input_pattern,
]
)

if fast_decode:
key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
video_options[key] = value
# Video codec
cmd.extend(["-c:v", vcodec])

# Set logging level
if log_level is not None:
# "While less efficient, it is generally preferable to modify logging with Python's logging"
logging.getLogger("libav").setLevel(log_level)

# Create and open output file (overwrite by default)
with av.open(str(video_path), "w") as output:
output_stream = output.add_stream(vcodec, fps, options=video_options)
output_stream.pix_fmt = pix_fmt
output_stream.width = width
output_stream.height = height

# Loop through input frames and encode them
for input_data in input_list:
with Image.open(input_data) as input_image:
input_image = input_image.convert("RGB")
input_frame = av.VideoFrame.from_image(input_image)
packet = output_stream.encode(input_frame)
if packet:
output.mux(packet)

# Flush the encoder
packet = output_stream.encode()
if packet:
output.mux(packet)
# Pixel format
cmd.extend(["-pix_fmt", pix_fmt])

# Reset logging level
if log_level is not None:
av.logging.restore_default_callback()
# GOP size (keyframe interval)
if g is not None:
cmd.extend(["-g", str(g)])

# CRF (Constant Rate Factor) for quality
if crf is not None:
cmd.extend(["-crf", str(crf)])

# Codec-specific options
if vcodec == "libsvtav1":
# Build svtav1-params string
# preset=13 is the fastest preset
# lp=6 is the highest level of parallelism
svtav1_params = "preset=13:lp=6"
if fast_decode:
svtav1_params += f":fast-decode={fast_decode}"
cmd.extend(["-svtav1-params", svtav1_params])
elif fast_decode and vcodec in ["h264", "hevc"]:
cmd.extend(["-tune", "fastdecode"])

# Output file
cmd.append(str(video_path))

# Run ffmpeg command
try:
subprocess.run(
cmd,
check=True,
capture_output=True,
text=True,
)
except subprocess.CalledProcessError as e:
raise OSError(
f"Video encoding failed with return code {e.returncode}.\n"
f"Command: {' '.join(cmd)}\n"
f"Error output: {e.stderr}"
) from e
except FileNotFoundError:
raise OSError(
"ffmpeg command not found. Please ensure ffmpeg is installed and available in PATH."
) from None

if not video_path.exists():
raise OSError(f"Video encoding did not work. File not found: {video_path}.")
Expand Down
Loading