diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d4d88ff032e1a7..bf8c20186d676e 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -836,6 +836,8 @@ title: LLaVA-NeXT - local: model_doc/llava_next_video title: LLaVa-NeXT-Video + - local: model_doc/llava_onevision + title: LLaVA-Onevision - local: model_doc/lxmert title: LXMERT - local: model_doc/matcha diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 5a16472cfd0e6b..8e3a4da8b021de 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -189,6 +189,7 @@ Flax), PyTorch, and/or TensorFlow. | [LLaVa](model_doc/llava) | ✅ | ❌ | ❌ | | [LLaVA-NeXT](model_doc/llava_next) | ✅ | ❌ | ❌ | | [LLaVa-NeXT-Video](model_doc/llava_next_video) | ✅ | ❌ | ❌ | +| [LLaVA-Onevision](model_doc/llava_onevision) | ✅ | ❌ | ❌ | | [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ | | [LongT5](model_doc/longt5) | ✅ | ❌ | ✅ | | [LUKE](model_doc/luke) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md new file mode 100644 index 00000000000000..64a127abca4c28 --- /dev/null +++ b/docs/source/en/model_doc/llava_onevision.md @@ -0,0 +1,319 @@ + + +# LLaVA-Onevision + +## Overview + +The LLaVA-Onevision model was proposed in [LLaVA-OneVision: Easy Visual Task Transfer](https://arxiv.org/abs/2408.03326) by + + LLaVA-Onevision architecture. Taken from the original paper. + +Tips: + +- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating. + + + +- Llava-Onevision uses a different number of patches for each image and thus has to pad the inputs inside the modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if the model is in `eval()` mode, otherwise "right-padding". 
+ + + +- Note that the model should use a specific prompt format, on which the large language model (LLM) was trained. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history, passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. + +We will use [llava-onevision-qwen2-7b-si-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-si-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows: + +```python +from transformers import AutoProcessor + +processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-si-hf") + +conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What’s shown in this image?"}, + ], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": "This image shows a red stop sign."},] + }, + { + + "role": "user", + "content": [ + {"type": "text", "text": "Describe the image in more details."}, + ], + }, +] + +text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + +# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images +print(text_prompt) +>>> "<|im_start|>user\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>" +``` + +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tree/main). 
+ + +## Usage example + +### Single image inference + +Here's how to load the model and perform inference in half-precision (`torch.float16`): + +```python +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration +import torch +from PIL import Image +import requests + +processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf") +model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True) +model.to("cuda:0") + +# prepare image and text prompt, using the appropriate prompt template +url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" +image = Image.open(requests.get(url, stream=True).raw) + +conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, +] +prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) +inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0", torch.float16) + +# autoregressively complete prompt +output = model.generate(**inputs, max_new_tokens=100) +print(processor.decode(output[0], skip_special_tokens=True)) +'user\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart, also known as a spider chart or a star chart, which is used to compare multiple quantitative variables. Each axis represents a different variable, and the chart is filled with' +``` + +### Multi image inference + +LLaVa-Onevision can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). For that you have to use checkpoints with an "ov" suffix. 
Here is how you can do it: + +```python +import requests +from PIL import Image +import torch +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration + +# Load the model in half-precision +model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, device_map="auto") +processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf") + +# Get three different images +url = "https://www.ilankelman.org/stopsigns/australia.jpg" +image_stop = Image.open(requests.get(url, stream=True).raw) + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image_cats = Image.open(requests.get(url, stream=True).raw) + +url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg" +image_snowman = Image.open(requests.get(url, stream=True).raw) + +# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not +conversation_1 = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "There is a red stop sign in the image."}, + ], + }, + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What about this image? 
def read_video_pyav(container, indices):
    """
    Decode the requested frames of a video with the PyAV decoder.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode. The first and last entries are used as the
            decoding window, so the sequence is assumed to be sorted in ascending order.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    """
    selected = []
    # Rewind so that frame positions are counted from the start of the stream.
    container.seek(0)
    first_wanted, last_wanted = indices[0], indices[-1]
    for position, frame in enumerate(container.decode(video=0)):
        # Nothing past the last requested index is needed: stop decoding early.
        if position > last_wanted:
            break
        if position < first_wanted or position not in indices:
            continue
        selected.append(frame)
    decoded = [picked.to_ndarray(format="rgb24") for picked in selected]
    return np.stack(decoded)
Simply change the snippet above with: + +```python +from transformers import LlavaOnevisionForConditionalGeneration, BitsAndBytesConfig + +# specify how to quantize the model +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, +) + +model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto") +``` + +### Use Flash-Attention 2 to further speed-up generation + +First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with: + +```python +from transformers import LlavaOnevisionForConditionalGeneration + +model = LlavaOnevisionForConditionalGeneration.from_pretrained( + model_id, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + use_flash_attention_2=True +).to(0) +``` + + +## LlavaOnevisionConfig + +[[autodoc]] LlavaOnevisionConfig + +## LlavaOnevisionProcessor + +[[autodoc]] LlavaOnevisionProcessor + +## LlavaOnevisionImageProcessor + +[[autodoc]] LlavaOnevisionImageProcessor + +## LlavaOnevisionVideoProcessor + +[[autodoc]] LlavaOnevisionVideoProcessor + +## LlavaOnevisionForConditionalGeneration + +[[autodoc]] LlavaOnevisionForConditionalGeneration + - forward diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 508886f33c6690..82bafc6b429561 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -60,6 +60,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Llava](https://huggingface.co/docs/transformers/model_doc/llava) * [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next) * [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video) +* 
[LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision) * [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava) * [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava) * [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100) @@ -226,6 +227,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel) * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel) * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) +* [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index bb84b5cb27d415..4f4b17ac84f1fb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -533,6 +533,7 @@ "LlavaNextVideoConfig", "LlavaNextVideoProcessor", ], + "models.llava_onevision": ["LlavaOnevisionConfig", "LlavaOnevisionProcessor"], "models.longformer": [ "LongformerConfig", "LongformerTokenizer", @@ -1183,6 +1184,9 @@ _import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"]) _import_structure["models.llava_next"].append("LlavaNextImageProcessor") _import_structure["models.llava_next_video"].append("LlavaNextVideoImageProcessor") + _import_structure["models.llava_onevision"].extend( + ["LlavaOnevisionImageProcessor", "LlavaOnevisionVideoProcessor"] + ) _import_structure["models.mask2former"].append("Mask2FormerImageProcessor") 
_import_structure["models.maskformer"].extend(["MaskFormerFeatureExtractor", "MaskFormerImageProcessor"]) _import_structure["models.mobilenet_v1"].extend(["MobileNetV1FeatureExtractor", "MobileNetV1ImageProcessor"]) @@ -2532,6 +2536,12 @@ "LlavaNextVideoPreTrainedModel", ] ) + _import_structure["models.llava_onevision"].extend( + [ + "LlavaOnevisionForConditionalGeneration", + "LlavaOnevisionPreTrainedModel", + ] + ) _import_structure["models.longformer"].extend( [ "LongformerForMaskedLM", @@ -5308,6 +5318,10 @@ LlavaNextVideoConfig, LlavaNextVideoProcessor, ) + from .models.llava_onevision import ( + LlavaOnevisionConfig, + LlavaOnevisionProcessor, + ) from .models.longformer import ( LongformerConfig, LongformerTokenizer, @@ -5993,6 +6007,7 @@ from .models.levit import LevitFeatureExtractor, LevitImageProcessor from .models.llava_next import LlavaNextImageProcessor from .models.llava_next_video import LlavaNextVideoImageProcessor + from .models.llava_onevision import LlavaOnevisionImageProcessor, LlavaOnevisionVideoProcessor from .models.mask2former import Mask2FormerImageProcessor from .models.maskformer import ( MaskFormerFeatureExtractor, @@ -7113,6 +7128,10 @@ LlavaNextVideoForConditionalGeneration, LlavaNextVideoPreTrainedModel, ) + from .models.llava_onevision import ( + LlavaOnevisionForConditionalGeneration, + LlavaOnevisionPreTrainedModel, + ) from .models.longformer import ( LongformerForMaskedLM, LongformerForMultipleChoice, diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 92756cf6e0e83c..1cb893f55a37c6 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -1063,6 +1063,7 @@ def __init__( self.batch_size = batch_size or max_batch_size self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len + # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads self.head_dim = ( config.head_dim if hasattr(config, "head_dim") 
else config.hidden_size // config.num_attention_heads diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 79105667dbe0c7..fa2b82ab4c2ba4 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1450,8 +1450,8 @@ def _get_cache( cache_dtype = self.get_output_embeddings().weight.dtype cache_kwargs = { - "config": self.config, - "batch_size": batch_size, + "config": self.config.text_config if hasattr(self.config, "text_config") else self.config, + "max_batch_size": batch_size, "max_cache_len": max_cache_len, "device": device, "dtype": cache_dtype, @@ -2353,7 +2353,11 @@ def _dola_decoding( this_peer_finished = False # prepare layers for DoLa decoding - final_layer = self.config.num_hidden_layers + final_layer = ( + self.config.text_config.num_hidden_layers + if hasattr(self.config, "text_config") + else self.config.num_hidden_layers + ) # if the model has tied word embeddings, we skip the word embeddings (0-th) layer and start from the 2nd layer, # as the early exit from word embeddings will become identity function # if the model is really shallow (<=2 layers), we use the 1st layer if it's not the final layer and the 0-th diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 8146d18bca4766..26b96def67d992 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -132,6 +132,7 @@ llava, llava_next, llava_next_video, + llava_onevision, longformer, longt5, luke, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 4dd553491a499e..fa1a7fb88eafa8 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -149,6 +149,7 @@ ("llava", "LlavaConfig"), ("llava_next", "LlavaNextConfig"), ("llava_next_video", "LlavaNextVideoConfig"), + ("llava_onevision", "LlavaOnevisionConfig"), ("longformer", 
"LongformerConfig"), ("longt5", "LongT5Config"), ("luke", "LukeConfig"), @@ -444,6 +445,7 @@ ("llava", "LLaVa"), ("llava_next", "LLaVA-NeXT"), ("llava_next_video", "LLaVa-NeXT-Video"), + ("llava_onevision", "LLaVA-Onevision"), ("longformer", "Longformer"), ("longt5", "LongT5"), ("luke", "LUKE"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 06809d371171c9..c83c43518a6a31 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -99,6 +99,7 @@ ("llava", ("CLIPImageProcessor",)), ("llava_next", ("LlavaNextImageProcessor",)), ("llava_next_video", ("LlavaNextVideoImageProcessor",)), + ("llava_onevision", ("LlavaOnevisionImageProcessor",)), ("mask2former", ("Mask2FormerImageProcessor",)), ("maskformer", ("MaskFormerImageProcessor",)), ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 334ff3820c63ed..45a9c4d0d078b7 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -314,6 +314,7 @@ ("llava", "LlavaForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), + ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("luke", "LukeForMaskedLM"), ("lxmert", "LxmertForPreTraining"), @@ -729,6 +730,7 @@ ("llava", "LlavaForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), + ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), ("paligemma", "PaliGemmaForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), ("qwen2_vl", "Qwen2VLForConditionalGeneration"), diff --git 
a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 197e4a1ebaa036..7f49e0e8d99730 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -73,6 +73,7 @@ ("llava", "LlavaProcessor"), ("llava_next", "LlavaNextProcessor"), ("llava_next_video", "LlavaNextVideoProcessor"), + ("llava_onevision", "LlavaOnevisionProcessor"), ("markuplm", "MarkupLMProcessor"), ("mctct", "MCTCTProcessor"), ("mgp-str", "MgpstrProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index fe6778182650f3..c8eb06db04a098 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -257,6 +257,7 @@ ), ), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("llava_onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/llava_onevision/__init__.py b/src/transformers/models/llava_onevision/__init__.py new file mode 100644 index 00000000000000..f16948a8f74017 --- /dev/null +++ b/src/transformers/models/llava_onevision/__init__.py @@ -0,0 +1,72 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_llava_onevision": ["LlavaOnevisionConfig"], + "processing_llava_onevision": ["LlavaOnevisionProcessor"], +} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_llava_onevision"] = ["LlavaOnevisionImageProcessor"] + + _import_structure["video_processing_llava_onevision"] = ["LlavaOnevisionVideoProcessor"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_llava_onevision"] = [ + "LlavaOnevisionForConditionalGeneration", + "LlavaOnevisionPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_llava_onevision import LlavaOnevisionConfig + from .processing_llava_onevision import LlavaOnevisionProcessor + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_llava_onevision import LlavaOnevisionImageProcessor + from .video_processing_llava_onevision import LlavaOnevisionVideoProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_llava_onevision import ( + LlavaOnevisionForConditionalGeneration, + LlavaOnevisionPreTrainedModel, + 
class LlavaOnevisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlavaOnevisionForConditionalGeneration`]. It is used to instantiate a
    Llava-Onevision model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)
    model.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`):
            The config object or dictionary of the vision backbone. A dictionary without a `"model_type"` key is
            treated as a `"siglip_vision_model"` config. The dictionary itself is not modified.
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
            The config object or dictionary of the text backbone. A dictionary without a `"model_type"` key is
            treated as a `"qwen2"` config. The dictionary itself is not modified.
        image_token_index (`int`, *optional*, defaults to 151646):
            The image token index to encode the image prompt.
        video_token_index (`int`, *optional*, defaults to 151647):
            The video token index to encode the video prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
            If `"full"`, the full vision features are used.
        vision_feature_layer (`int`, *optional*, defaults to -1):
            The index of the layer to select the vision feature.
        vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
            Aspect ratio used when processing image features. The default value is "anyres_max_9".
        image_grid_pinpoints (`List`, *optional*):
            A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
            of the form `(height, width)`. Defaults to every `[height, width]` combination of the multiples of 384
            from 384 up to 2304 (36 resolutions).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.

    Raises:
        ValueError: If `vision_feature_select_strategy` is neither `"default"` nor `"full"`.

    Example:

    ```python
    >>> from transformers import LlavaOnevisionForConditionalGeneration, LlavaOnevisionConfig, SiglipVisionConfig, Qwen2Config

    >>> # Initializing a Siglip vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Qwen2 config
    >>> text_config = Qwen2Config()

    >>> # Initializing a Llava-Onevision llava-hf/llava-onevision-qwen2-7b-ov-hf style configuration
    >>> configuration = LlavaOnevisionConfig(vision_config, text_config)

    >>> # Initializing a model from the llava-hf/llava-onevision-qwen2-7b-ov-hf style configuration
    >>> model = LlavaOnevisionForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "llava_onevision"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        image_token_index=151646,
        video_token_index=151647,
        projector_hidden_act="gelu",
        vision_feature_select_strategy="full",
        vision_feature_layer=-1,
        vision_aspect_ratio="anyres_max_9",
        image_grid_pinpoints=None,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.image_token_index = image_token_index
        self.video_token_index = video_token_index
        self.projector_hidden_act = projector_hidden_act

        if vision_feature_select_strategy not in ["default", "full"]:
            # The trailing space keeps the two concatenated literals from running together.
            raise ValueError(
                "vision_feature_select_strategy should be one of 'default', 'full'. "
                f"Got: {vision_feature_select_strategy}"
            )

        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer
        self.vision_aspect_ratio = vision_aspect_ratio

        if image_grid_pinpoints is None:
            # Default grid: [384, 384], [384, 768], ..., [2304, 2304] (height varies slowest).
            grid_sizes = range(384, 2304 + 1, 384)
            image_grid_pinpoints = [[height, width] for height in grid_sizes for width in grid_sizes]
        self.image_grid_pinpoints = image_grid_pinpoints

        if isinstance(vision_config, dict):
            # Build a new dict so the default `model_type` is not written into the caller's object.
            vision_config = {"model_type": "siglip_vision_model", **vision_config}
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            vision_config = CONFIG_MAPPING["siglip_vision_model"](
                hidden_size=1152,
                intermediate_size=4304,
                patch_size=14,
                image_size=384,
                num_hidden_layers=26,
                num_attention_heads=14,
                vision_use_head=False,
            )

        self.vision_config = vision_config

        if isinstance(text_config, dict):
            # Same non-mutating default as for the vision config.
            text_config = {"model_type": "qwen2", **text_config}
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["qwen2"]()

        self.text_config = text_config

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
# Substring renames for checkpoint parameter names; presumably maps the original LLaVA-Onevision
# names (keys) to the names used by the HF implementation (values) - TODO confirm against the
# conversion routine, which is outside this view.
# NOTE(review): several keys are substrings of other keys or of earlier values (e.g. "model",
# "model.model", "language_model.model.image_newline"), so the result looks order-dependent if the
# entries are applied sequentially in insertion order - verify before reordering or adding entries.
KEYS_TO_MODIFY_MAPPING = {
    "model.vision_tower.": "",
    "model.mm_projector": "multi_modal_projector",
    "model": "model.model",
    "vision_model.model": "vision_model",
    "lm_head": "language_model.lm_head",
    "model.model": "language_model.model",
    "multi_modal_projector.0": "multi_modal_projector.linear_1",
    "multi_modal_projector.2": "multi_modal_projector.linear_2",
    "language_model.model.image_newline": "image_newline",
}
+chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '