VLM: fixes after refactor (#32907)

* leave only half of the changes * fix tests * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava * fix tests, first try * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava * fix, second try * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava * fix * [run-slow] llava, llava_next, llava_next_video, vipllava, video_llava
huggingface · Sep 10, 2024 · 7d2d6ce · 7d2d6ce
1 parent f24f084
commit 7d2d6ce
Show file tree

Hide file tree

Showing 15 changed files with 581 additions and 504 deletions.
diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
@@ -476,6 +476,7 @@ def forward(
                     inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
                         image_features, inputs_embeds, input_ids, attention_mask, labels
                     )
+                    cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
                 else:
                     # Retrieve the first layer to inspect the logits and mask out the hidden states
                     # that are set to 0
@@ -506,6 +507,9 @@ def forward(
 
                     attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
                     position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+                    cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[
+                        -target_length:
+                    ]
 
             # TODO: @raushan retain only the new behavior after v4.47
             else:
@@ -585,9 +589,7 @@ def prepare_inputs_for_generation(
             **kwargs,
         )
 
-        if legacy_processing:
-            model_inputs["pixel_values"] = pixel_values
-        elif cache_position[0] == 0:
+        if legacy_processing or cache_position[0] == 0:
             # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
             # Otherwise we need pixel values to be passed to model
             model_inputs["pixel_values"] = pixel_values

diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
@@ -136,6 +136,7 @@ def __call__(
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
         # try to expand inputs in processing if we have the necessary parts
+        prompt_strings = text
         if image_inputs.get("pixel_values") is not None:
             if self.patch_size is not None and self.vision_feature_select_strategy is not None:
                 # Replace the image token with the expanded image token sequence
@@ -150,7 +151,6 @@ def __call__(
                     sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
                     prompt_strings.append(sample)
             else:
-                prompt_strings = text
                 logger.warning_once(
                     "Expanding inputs for image tokens in LLaVa should be done in processing. "
                     "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "

diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -848,6 +848,7 @@ def forward(
                         position_ids,
                         labels=labels,
                     )
+                    cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
                 else:
                     # Retrieve the first layer to inspect the logits and mask out the hidden states
                     # that are set to 0
@@ -877,6 +878,9 @@ def forward(
                     extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
                     attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
                     position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+                    cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[
+                        -target_length:
+                    ]
 
             # TODO: @raushan retain only the new behavior after v4.47
             else:
@@ -956,12 +960,9 @@ def prepare_inputs_for_generation(
             **kwargs,
         )
 
-        if legacy_processing:
-            model_inputs["pixel_values"] = pixel_values
-            model_inputs["image_sizes"] = image_sizes
-        elif cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+        # Otherwise we need pixel values to be passed to model
+        if legacy_processing or cache_position[0] == 0:
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
 

diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py
@@ -140,30 +140,29 @@ def __call__(
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
-        if self.patch_size is None or self.vision_feature_select_strategy is None:
-            prompt_strings = text
-            logger.warning_once(
-                "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-        # cannot infer image expansion length if no images are found
-        elif not image_inputs:
-            prompt_strings = text
-        else:
-            image_sizes = image_inputs["image_sizes"]
-            height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
-            prompt_strings = []
-            for image_size, sample in zip(image_sizes, text):
-                # Replace the image token with the expanded image token sequence
-                orig_height, orig_width = image_size
-                num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
-                if self.vision_feature_select_strategy == "default":
-                    num_image_tokens -= 1
-
-                sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
-                prompt_strings.append(sample)
+        prompt_strings = text
+        if image_inputs:
+            if self.patch_size is None or self.vision_feature_select_strategy is None:
+                logger.warning_once(
+                    "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
+                    "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
+                    "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
+                    "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                )
+            else:
+                image_sizes = iter(image_inputs["image_sizes"])
+                height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
+                prompt_strings = []
+                for sample in text:
+                    while self.image_token in sample:
+                        image_size = next(image_sizes)
+                        orig_height, orig_width = image_size
+                        num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
+                        if self.vision_feature_select_strategy == "default":
+                            num_image_tokens -= 1
+                        sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
+                    prompt_strings.append(sample)
+                prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
 
         text_inputs = self.tokenizer(
             prompt_strings,