Use PyAV instead of Decord in examples #21572

Merged
3 changes: 2 additions & 1 deletion docker/transformers-all-latest-gpu/Dockerfile
@@ -51,7 +51,8 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/acc
 # Add bitsandbytes for mixed int8 testing
 RUN python3 -m pip install --no-cache-dir bitsandbytes

-RUN python3 -m pip install --no-cache-dir decord
+# For video model testing
+RUN python3 -m pip install --no-cache-dir decord av==9.2.0

Collaborator Author:
Not removing decord yet, as it's still used in some other areas, e.g. the video classification pipeline.
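For context, a minimal sketch of the kind of usage that still depends on decord (assuming the video-classification pipeline, which decoded clips with decord at the time of this PR; the model name and file path here are illustrative, not from the PR):

```python
from transformers import pipeline

# The video-classification pipeline still relies on decord for decoding,
# which is why the package stays in the image alongside PyAV.
video_classifier = pipeline("video-classification", model="MCG-NJU/videomae-base-finetuned-kinetics")
predictions = video_classifier("eating_spaghetti.mp4")  # hypothetical local file
print(predictions)
```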


 # For `dinat` model
 RUN python3 -m pip install --no-cache-dir natten -f https://shi-labs.com/natten/wheels/$CUDA/
3 changes: 2 additions & 1 deletion setup.py
@@ -98,6 +98,7 @@
 _deps = [
     "Pillow",
     "accelerate>=0.10.0",
+    "av==9.2.0",  # Latest version of PyAV (10.0.0) has issues with audio stream.
     "beautifulsoup4",
     "black~=23.1",
     "codecarbon==1.2.0",
@@ -289,7 +290,7 @@ def run(self):
 extras["torch-vision"] = deps_list("torchvision") + extras["vision"]
 extras["natten"] = deps_list("natten")
 extras["codecarbon"] = deps_list("codecarbon")
-extras["video"] = deps_list("decord")
+extras["video"] = deps_list("decord", "av")

 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = (
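With this change, `pip install "transformers[video]"` pulls in both decord and the pinned av. As for the pin itself, the inline comment above flags audio-stream problems in PyAV 10.0.0; a minimal sketch of the kind of audio decode the pin guards against (file name illustrative, not from the PR):

```python
import av

# Minimal audio decode loop; this style of audio-stream access is what
# reportedly misbehaved under av==10.0.0, hence the av==9.2.0 pin above.
container = av.open("sample.mp4")  # hypothetical local file
audio_stream = container.streams.audio[0]
for frame in container.decode(audio_stream):
    samples = frame.to_ndarray()  # array of PCM samples per frame
```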
1 change: 1 addition & 0 deletions src/transformers/dependency_versions_table.py
@@ -4,6 +4,7 @@
 deps = {
     "Pillow": "Pillow",
     "accelerate": "accelerate>=0.10.0",
+    "av": "av==9.2.0",
     "beautifulsoup4": "beautifulsoup4",
     "black": "black~=23.1",
     "codecarbon": "codecarbon==1.2.0",
43 changes: 30 additions & 13 deletions src/transformers/models/git/modeling_git.py
@@ -1425,11 +1425,11 @@ def forward(
 Video captioning example:

 ```python
->>> from transformers import AutoProcessor, AutoModelForCausalLM
->>> from PIL import Image
+>>> import av
 >>> import numpy as np
+>>> from PIL import Image
 >>> from huggingface_hub import hf_hub_download
->>> from decord import VideoReader, cpu
+>>> from transformers import AutoProcessor, AutoModelForCausalLM

 >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
 >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
@@ -1438,6 +1438,27 @@ def forward(
 >>> np.random.seed(45)


+>>> def read_video_pyav(container, indices):
+...     '''
+...     Decode the video with PyAV decoder.
+...     Args:
+...         container (`av.container.input.InputContainer`): PyAV container.
+...         indices (`List[int]`): List of frame indices to decode.
+...     Returns:
+...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+...     '''
+...     frames = []
+...     container.seek(0)
+...     start_index = indices[0]
+...     end_index = indices[-1]
+...     for i, frame in enumerate(container.decode(video=0)):
+...         if i > end_index:
+...             break
+...         if i >= start_index and i in indices:
+...             frames.append(frame)
+...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
 >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
 ...     converted_len = int(clip_len * frame_sample_rate)
 ...     end_idx = np.random.randint(converted_len, seg_len)
@@ -1447,24 +1468,20 @@ def forward(
 ...     return indices


->>> def sample_frames(file_path, num_frames):
-...     videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
-...     videoreader.seek(0)
-...     indices = sample_frame_indices(clip_len=num_frames, frame_sample_rate=4, seg_len=len(videoreader))
-...     frames = videoreader.get_batch(indices).asnumpy()
-...     return list(frames)
-
-
 >>> # load video
 >>> file_path = hf_hub_download(
 ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
 ... )
+>>> container = av.open(file_path)

 >>> # sample frames
 >>> num_frames = model.config.num_image_with_embedding
->>> frames = sample_frames(file_path, num_frames)
+>>> indices = sample_frame_indices(
+...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
+... )
+>>> frames = read_video_pyav(container, indices)

->>> pixel_values = processor(images=frames, return_tensors="pt").pixel_values
+>>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

 >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
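For intuition, a quick worked example of the frame-sampling arithmetic used above (illustrative numbers; the helper body shown here is an assumption, since the folded hunk above elides it):

```python
import numpy as np

np.random.seed(45)

# Assumed shape of the elided helper: pick a random window of
# clip_len * frame_sample_rate frames, then take clip_len evenly
# spaced indices inside that window.
def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    converted_len = int(clip_len * frame_sample_rate)    # e.g. 6 * 4 = 24
    end_idx = np.random.randint(converted_len, seg_len)  # random window end
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

# With clip_len=6 and frame_sample_rate=4, this always yields 6 indices
# spread over a 24-frame window somewhere inside the clip.
print(sample_frame_indices(clip_len=6, frame_sample_rate=4, seg_len=300))
```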
76 changes: 59 additions & 17 deletions src/transformers/models/timesformer/modeling_timesformer.py
@@ -570,12 +570,35 @@ def forward(
 Examples:

 ```python
->>> from decord import VideoReader, cpu
+>>> import av
 >>> import numpy as np

->>> from transformers import AutoFeatureExtractor, TimesformerModel
+>>> from transformers import AutoImageProcessor, TimesformerModel
 >>> from huggingface_hub import hf_hub_download

 >>> np.random.seed(0)

+
+>>> def read_video_pyav(container, indices):
+...     '''
+...     Decode the video with PyAV decoder.
+...     Args:
+...         container (`av.container.input.InputContainer`): PyAV container.
+...         indices (`List[int]`): List of frame indices to decode.
+...     Returns:
+...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+...     '''
+...     frames = []
+...     container.seek(0)
+...     start_index = indices[0]
+...     end_index = indices[-1]
+...     for i, frame in enumerate(container.decode(video=0)):
+...         if i > end_index:
+...             break
+...         if i >= start_index and i in indices:
+...             frames.append(frame)
+...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
 >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
 ...     converted_len = int(clip_len * frame_sample_rate)
@@ -590,24 +613,23 @@ def forward(
 >>> file_path = hf_hub_download(
 ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
 ... )
->>> videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+>>> container = av.open(file_path)

 >>> # sample 8 frames
->>> videoreader.seek(0)
->>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=4, seg_len=len(videoreader))
->>> video = videoreader.get_batch(indices).asnumpy()
+>>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
+>>> video = read_video_pyav(container, indices)

->>> feature_extractor = AutoFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
+>>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
 >>> model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k400")

 >>> # prepare video for the model
->>> inputs = feature_extractor(list(video), return_tensors="pt")
+>>> inputs = image_processor(list(video), return_tensors="pt")

 >>> # forward pass
 >>> outputs = model(**inputs)
 >>> last_hidden_states = outputs.last_hidden_state
 >>> list(last_hidden_states.shape)
-[1, 1568, 768]
+[1, 1569, 768]

Collaborator Author:
This lost me a lot of time 😅

  • These doc tests weren't being run, as they weren't listed in utils/documentation_tests.txt.
  • Testing on the huggingface/transformers-all-latest-gpu:latest image on main, I get the same last_hidden_state shape -- [1, 1569, 768] -- as with the PyAV change.
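
A minimal way to check such a docstring example locally with the standard library (a sketch, not the repo's own doctest harness, which goes through utils/documentation_tests.txt):

```python
import doctest

# Run the docstring examples in the module directly; note that doing so
# will download the checkpoints the examples reference.
import transformers.models.timesformer.modeling_timesformer as mod

results = doctest.testmod(mod, verbose=False, optionflags=doctest.ELLIPSIS)
print(f"{results.attempted} examples attempted, {results.failed} failed")
```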

```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
Expand Down Expand Up @@ -676,16 +698,37 @@ def forward(
 Examples:

 ```python
->>> from decord import VideoReader, cpu
+>>> import av
 >>> import torch
 >>> import numpy as np

->>> from transformers import AutoFeatureExtractor, TimesformerForVideoClassification
+>>> from transformers import AutoImageProcessor, TimesformerForVideoClassification
 >>> from huggingface_hub import hf_hub_download

 >>> np.random.seed(0)

+
+>>> def read_video_pyav(container, indices):
+...     '''
+...     Decode the video with PyAV decoder.
+...     Args:
+...         container (`av.container.input.InputContainer`): PyAV container.
+...         indices (`List[int]`): List of frame indices to decode.
+...     Returns:
+...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+...     '''
+...     frames = []
+...     container.seek(0)
+...     start_index = indices[0]
+...     end_index = indices[-1]
+...     for i, frame in enumerate(container.decode(video=0)):
+...         if i > end_index:
+...             break
+...         if i >= start_index and i in indices:
+...             frames.append(frame)
+...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
 >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
 ...     converted_len = int(clip_len * frame_sample_rate)
 ...     end_idx = np.random.randint(converted_len, seg_len)
@@ -699,17 +742,16 @@ def forward(
 >>> file_path = hf_hub_download(
 ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
 ... )
->>> videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+>>> container = av.open(file_path)

 >>> # sample 8 frames
->>> videoreader.seek(0)
->>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=4, seg_len=len(videoreader))
->>> video = videoreader.get_batch(indices).asnumpy()
+>>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
+>>> video = read_video_pyav(container, indices)

->>> feature_extractor = AutoFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
+>>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
 >>> model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")

->>> inputs = feature_extractor(list(video), return_tensors="pt")
+>>> inputs = image_processor(list(video), return_tensors="pt")

 >>> with torch.no_grad():
 ...     outputs = model(**inputs)
62 changes: 52 additions & 10 deletions src/transformers/models/videomae/modeling_videomae.py
@@ -576,12 +576,35 @@ def forward(
 Examples:

 ```python
->>> from decord import VideoReader, cpu
+>>> import av
 >>> import numpy as np

 >>> from transformers import AutoImageProcessor, VideoMAEModel
 >>> from huggingface_hub import hf_hub_download

 >>> np.random.seed(0)

+
+>>> def read_video_pyav(container, indices):
+...     '''
+...     Decode the video with PyAV decoder.
+...     Args:
+...         container (`av.container.input.InputContainer`): PyAV container.
+...         indices (`List[int]`): List of frame indices to decode.
+...     Returns:
+...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+...     '''
+...     frames = []
+...     container.seek(0)
+...     start_index = indices[0]
+...     end_index = indices[-1]
+...     for i, frame in enumerate(container.decode(video=0)):
+...         if i > end_index:
+...             break
+...         if i >= start_index and i in indices:
+...             frames.append(frame)
+...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
 >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
 ...     converted_len = int(clip_len * frame_sample_rate)
@@ -596,12 +619,11 @@ def forward(
 >>> file_path = hf_hub_download(
 ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
 ... )
->>> videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+>>> container = av.open(file_path)

 >>> # sample 16 frames
->>> videoreader.seek(0)
->>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
->>> video = videoreader.get_batch(indices).asnumpy()
+>>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
+>>> video = read_video_pyav(container, indices)

 >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
 >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")
@@ -944,7 +966,7 @@ def forward(
 Examples:

 ```python
->>> from decord import VideoReader, cpu
+>>> import av
 >>> import torch
 >>> import numpy as np

@@ -954,6 +976,27 @@ def forward(
 >>> np.random.seed(0)


+>>> def read_video_pyav(container, indices):
+...     '''
+...     Decode the video with PyAV decoder.
+...     Args:
+...         container (`av.container.input.InputContainer`): PyAV container.
+...         indices (`List[int]`): List of frame indices to decode.
+...     Returns:
+...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+...     '''
+...     frames = []
+...     container.seek(0)
+...     start_index = indices[0]
+...     end_index = indices[-1]
+...     for i, frame in enumerate(container.decode(video=0)):
+...         if i > end_index:
+...             break
+...         if i >= start_index and i in indices:
+...             frames.append(frame)
+...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
 >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
 ...     converted_len = int(clip_len * frame_sample_rate)
 ...     end_idx = np.random.randint(converted_len, seg_len)
@@ -967,12 +1010,11 @@ def forward(
 >>> file_path = hf_hub_download(
 ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
 ... )
->>> videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+>>> container = av.open(file_path)

 >>> # sample 16 frames
->>> videoreader.seek(0)
->>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
->>> video = videoreader.get_batch(indices).asnumpy()
+>>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
+>>> video = read_video_pyav(container, indices)

 >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
 >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
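One practical caveat with the examples above (a hedged note, not part of the PR): PyAV's `container.streams.video[0].frames` can be 0 for containers that don't record a frame count in their metadata, in which case a fallback count is needed before calling `sample_frame_indices`:

```python
import av

container = av.open("eating_spaghetti.mp4")  # illustrative path
seg_len = container.streams.video[0].frames
if seg_len == 0:
    # Some containers don't expose a frame count in metadata;
    # fall back to decoding once and counting the frames.
    seg_len = sum(1 for _ in container.decode(video=0))
    container.seek(0)
```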