Expand inputs in processors for VLMs #30962

Merged: 44 commits, merged on Aug 13, 2024

Changes from 1 commit. Commits (44):
050657f
let it be
zucchini-nlp May 20, 2024
a67087e
draft
zucchini-nlp May 22, 2024
1e2b873
should not have changed
zucchini-nlp May 22, 2024
70145d4
add warnings
zucchini-nlp May 29, 2024
16a6787
Merge remote-tracking branch 'upstream/main' into vlm_processors
zucchini-nlp May 29, 2024
8472035
fix & add tests
zucchini-nlp May 29, 2024
13af9e8
fix tests
zucchini-nlp May 29, 2024
41d086f
inputs embeds cannot be passed with pixels
zucchini-nlp May 29, 2024
bf59ed6
more updates
zucchini-nlp Jun 7, 2024
020e7ed
paligemma ready!
zucchini-nlp Jun 10, 2024
3e0455c
minor typos
zucchini-nlp Jun 10, 2024
674f16e
update blip-2
zucchini-nlp Jun 10, 2024
42ae646
fix tests & raise error
zucchini-nlp Jun 10, 2024
b5259f2
Merge branch 'main' into vlm_processors
zucchini-nlp Jun 10, 2024
a6c50de
docstring
zucchini-nlp Jun 10, 2024
4766e2e
add blip2 test
zucchini-nlp Jun 10, 2024
d46df90
Merge branch 'main' into vlm_processors
zucchini-nlp Jun 10, 2024
f74297b
tmp
zucchini-nlp Jun 17, 2024
5fc8565
add image seq length to config
zucchini-nlp Jun 18, 2024
1b4674a
update docstring
zucchini-nlp Jun 18, 2024
c3c130b
Merge branch 'main' into vlm_processors
zucchini-nlp Jun 18, 2024
8438875
delete
zucchini-nlp Jun 18, 2024
bf9e637
fix tests
zucchini-nlp Jun 18, 2024
db1fa4f
fix blip
zucchini-nlp Jun 18, 2024
246b06a
fix paligemma
zucchini-nlp Jun 21, 2024
222bf9a
merge `main`
zucchini-nlp Jul 18, 2024
5486215
out-of-place scatter
zucchini-nlp Jul 18, 2024
78c4484
add llava-next-video
zucchini-nlp Jul 18, 2024
d60624e
Update src/transformers/models/blip_2/modeling_blip_2.py
zucchini-nlp Aug 5, 2024
1973b39
remove tmp
zucchini-nlp Aug 5, 2024
a6e380f
merge `main`
zucchini-nlp Aug 5, 2024
8e88d8b
codestyle
zucchini-nlp Aug 5, 2024
689eed9
nits
zucchini-nlp Aug 6, 2024
28e8054
more nits
zucchini-nlp Aug 6, 2024
637e514
remove overriding in tests
zucchini-nlp Aug 6, 2024
be939d8
comprehension when merging video
zucchini-nlp Aug 6, 2024
232eb7c
fix-copies
zucchini-nlp Aug 6, 2024
385a617
revert changes for embeds test
zucchini-nlp Aug 6, 2024
4831a7e
fix tests after making comprehension
zucchini-nlp Aug 6, 2024
85fbff9
Update src/transformers/models/blip_2/processing_blip_2.py
zucchini-nlp Aug 8, 2024
119178f
Update src/transformers/models/blip_2/processing_blip_2.py
zucchini-nlp Aug 8, 2024
2451911
more updates
zucchini-nlp Aug 8, 2024
414031e
fix tests
zucchini-nlp Aug 8, 2024
8cfad20
Merge remote-tracking branch 'upstream/main' into vlm_processors
zucchini-nlp Aug 9, 2024
Viewing commit: merge main (zucchini-nlp committed Aug 5, 2024)
commit a6e380fe495b4350c1a70f95f5292ece1326fe10
26 changes: 13 additions & 13 deletions src/transformers/models/llava/processing_llava.py
@@ -128,20 +128,20 @@ def __call__(
         if images is not None:
             image_inputs = self.image_processor(images, return_tensors=return_tensors)
         else:
-            pixel_values = None
+            image_inputs = {}
 
         if isinstance(text, str):
             text = [text]
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
         # try to expand inputs in processing if we have the necessary parts
-        if (
-            pixel_values is not None
-            and self.patch_size is not None
+        if image_inputs.get("pixel_values") is not None:
+            if (self.patch_size is not None
             and self.vision_feature_select_strategy is not None
         ):
             # Replace the image token with the expanded image token sequence
+            pixel_values = image_inputs["pixel_values"]
             height, width = get_image_size(to_numpy_array(pixel_values[0]))
             num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
             if self.vision_feature_select_strategy == "default":
@@ -151,14 +151,14 @@ def __call__(
             for sample in text:
                 sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
                 prompt_strings.append(sample)
-        elif pixel_values is not None:
-            prompt_strings = text
-            logger.warning_once(
-                "Expanding inputs for image tokens in LLaVa should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
-            )
+        else:
+            prompt_strings = text
+            logger.warning_once(
+                "Expanding inputs for image tokens in LLaVa should be done in processing. "
+                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
+                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
+            )
 
         text_inputs = self.tokenizer(
             prompt_strings,
@@ -167,7 +167,7 @@ def __call__(
             truncation=truncation,
             max_length=max_length,
         )
-        return BatchFeature(data={**text_inputs, "pixel_values": pixel_values})
+        return BatchFeature(data={**text_inputs, **image_inputs})
 
     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
     def batch_decode(self, *args, **kwargs):
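For orientation, the expansion branch above reduces to a small piece of arithmetic. The snippet below is a minimal, self-contained sketch of it (not the PR's code), using hypothetical values for the patch size, processed resolution, and prompt; the `-= 1` for the "default" strategy reflects the assumption that this strategy drops the CLS feature. As the deprecation warning in the diff says, older processor configs can opt in by setting `processor.patch_size` and `processor.vision_feature_select_strategy` directly.

```python
# Hedged sketch (not the PR's code): how the processor-side expansion sizes the
# image placeholder sequence. All values below are illustrative.
patch_size = 14                        # hypothetical vision patch size
height, width = 336, 336               # hypothetical processed image resolution
vision_feature_select_strategy = "default"
image_token = "<image>"

num_image_tokens = (height // patch_size) * (width // patch_size) + 1  # patches + CLS
if vision_feature_select_strategy == "default":
    num_image_tokens -= 1              # assumed: "default" drops the CLS feature

prompt = "USER: <image>\nWhat is shown in this image? ASSISTANT:"
expanded = prompt.replace(image_token, image_token * num_image_tokens)

print(num_image_tokens)                # 576 with these example values
print(expanded.count(image_token))     # one placeholder token per visual feature
```

With one placeholder token per visual feature already in the prompt, the tokenizer output has the final sequence length, which is the point of moving the expansion from the model's forward pass into the processor.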
13 changes: 13 additions & 0 deletions tests/models/llava/test_modeling_llava.py
@@ -501,6 +501,19 @@ def test_tokenizer_integration(self):
         self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
         self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
 
+    @slow
+    @require_bitsandbytes
+    def test_generation_no_images(self):
+        model_id = "llava-hf/llava-1.5-7b-hf"
+        model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        # Prepare inputs with no images
+        inputs = processor("Hello, I am", return_tensors="pt").to(torch_device)
+
+        # Make sure that `generate` works
+        _ = model.generate(**inputs, max_new_tokens=20)
+
     @slow
     @require_bitsandbytes
     def test_expansion_in_processing(self):
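A quick sketch of why the text-only path exercised by this test works after the change above: when no images are passed, the processor builds an empty `image_inputs` dict, so unpacking it into the returned `BatchFeature` adds no `pixel_values` key at all, instead of carrying a `pixel_values=None` entry. The snippet below is illustrative only; the token ids are made up.

```python
# Hedged sketch: merging text features with an empty image dict simply omits
# the pixel_values key (it does not insert pixel_values=None).
text_inputs = {"input_ids": [[1, 15043, 29892, 306, 626]], "attention_mask": [[1, 1, 1, 1, 1]]}  # made-up ids
image_inputs = {}  # no images were passed to the processor

batch = {**text_inputs, **image_inputs}
print("pixel_values" in batch)  # False, so generate() never receives a stray pixel_values entry
```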
35 changes: 34 additions & 1 deletion tests/models/llava_next/test_modeling_llava_next.py
@@ -516,7 +516,40 @@ def test_small_model_integration_test_batch_matches_single(self):
 
     @slow
     @require_bitsandbytes
-    def test_expansion_in_processing(self):
+    def test_padding_side_when_merging_inputs(self):
+        model = LlavaNextForConditionalGeneration.from_pretrained(
+            "llava-hf/llava-v1.6-mistral-7b-hf",
+            load_in_4bit=True,
+        )
+
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        lowres_url = "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e"
+        cats_image = Image.open(requests.get(url, stream=True).raw)
+        lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)
+
+        inputs_batched = self.processor(
+            [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
+        ).to(torch_device)
+
+        # model is in eval mode by default so we should get pad on the left side
+        # we can check the first hidden-states (aka inputs embeds)
+        # the first element was lo-res image and we expect the first 1414 tokens to be all pads
+        output_eval = model(**inputs_batched, output_hidden_states=True)
+        self.assertTrue((output_eval.hidden_states[0][0, :1414, ...] == 0).all().item())
+
+        # otherwise padding is on the right side, so it's last 1414 tokens
+        self.processor.padding_side = "right"
+        inputs_batched = self.processor(
+            [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
+        ).to(torch_device)
+
+        model.train()
+        with torch.no_grad():
+            output_train = model(**inputs_batched, output_hidden_states=True)
+        self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] == 0).all().item())
+
+    @slow
+    @require_bitsandbytes
+    def test_expansion_in_processing(self):
         model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
         model = LlavaNextForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
         processor = AutoProcessor.from_pretrained(model_id)
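As background for the padding-side tests in this file and the next one, here is a small, self-contained illustration (not from this PR) of where pad positions end up for left versus right padding; the token ids and pad id are made up. The tests rely on the same idea one level up: pad positions correspond to all-zero rows of the first hidden state, so with left padding they are the first N positions and with right padding the last N.

```python
# Hedged illustration: left padding puts pads at the start of each sequence,
# right padding puts them at the end. Token ids below are made up.
pad_id = 0
batch = [[5, 6, 7], [8, 9, 10, 11, 12]]  # ragged batch of token ids
max_len = max(len(seq) for seq in batch)

def pad(seq, side):
    pads = [pad_id] * (max_len - len(seq))
    return pads + seq if side == "left" else seq + pads

left = [pad(seq, "left") for seq in batch]    # [[0, 0, 5, 6, 7], [8, 9, 10, 11, 12]]
right = [pad(seq, "right") for seq in batch]  # [[5, 6, 7, 0, 0], [8, 9, 10, 11, 12]]
print(left[0], right[0])
```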
36 changes: 36 additions & 0 deletions tests/models/llava_next_video/test_modeling_llava_next_video.py
@@ -452,6 +452,42 @@ def test_small_model_integration_test_batch_matches_single(self):
             self.processor.decode(output_single[0], skip_special_tokens=True),
         )
 
+    @slow
+    @require_bitsandbytes
+    def test_padding_side_when_merging_inputs(self):
+        model = LlavaNextVideoForConditionalGeneration.from_pretrained(
+            "llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True
+        )
+
+        inputs_batched = self.processor(
+            [self.prompt_video, self.prompt_image],
+            images=[self.image],
+            videos=[self.video],
+            return_tensors="pt",
+            padding=True,
+        ).to(torch_device)
+
+        # model is in eval mode by default so we should get pad on the left side
+        # we can check the first hidden-states (aka inputs embeds)
+        # the first element was lo-res image and we expect the first 1482 tokens to be all pads
+        output_eval = model(**inputs_batched, output_hidden_states=True)
+        self.assertTrue((output_eval.hidden_states[0][0, :1482, ...] == 0).all().item())
+
+        # otherwise padding is on the right side, so it's last 1482 tokens
+        self.processor.padding_side = "right"
+        inputs_batched = self.processor(
+            [self.prompt_video, self.prompt_image],
+            images=[self.image],
+            videos=[self.video],
+            return_tensors="pt",
+            padding=True,
+        ).to(torch_device)
+
+        model.train()
+        with torch.no_grad():
+            output_train = model(**inputs_batched, output_hidden_states=True)
+        self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] == 0).all().item())
+
     @slow
     @require_bitsandbytes
     def test_expansion_in_processing(self):