From c55f0519244063525532f3e560b158387dcb9966 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Wed, 18 Sep 2024 05:24:42 +0800 Subject: [PATCH] =?UTF-8?q?fix=20patch=5Fattention=5Fmask=20incorrect=20se?= =?UTF-8?q?tting=20which=20leads=20to=20the=20differe=E2=80=A6=20(#33499)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix patch_attention_mask incorrect setting which leads to the difference in the generated text if batch > 1 Signed-off-by: Wang, Yi * fix format Signed-off-by: Wang, Yi * [run_slow] idefics2 --------- Signed-off-by: Wang, Yi --- .../models/idefics2/modeling_idefics2.py | 2 +- .../models/idefics2/test_modeling_idefics2.py | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 08ada424ea77b4..6108f0e8a42e8f 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1388,7 +1388,7 @@ def forward( patch_size = self.config.vision_config.patch_size patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size) patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) - patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) == patch_size * patch_size).bool() # Get sequence from the vision encoder image_hidden_states = self.vision_model( diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 8d48acb9500d2f..e02c5b4c9f09c6 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -540,6 +540,41 @@ def test_integration_test_4bit(self): expected_generated_text = "In this image, we see the Statue of Liberty, the Hudson River," 
self.assertEqual(generated_texts[0], expected_generated_text) + @slow + @require_bitsandbytes + def test_integration_test_4bit_batch2(self): + # Let's make sure we test the preprocessing to replace what is used + + model = Idefics2ForConditionalGeneration.from_pretrained( + "HuggingFaceM4/idefics2-8b-base", + load_in_4bit=True, + ) + + from datasets import load_dataset + + dataset = load_dataset("nielsr/docvqa_1200_examples", split="test") + + text = [f"{dataset[40]['query']['en']}", f"{dataset[41]['query']['en']}"] + images = [[dataset[40]["image"]], [dataset[41]["image"]]] + inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt") + generated_ids = model.generate(**inputs, max_new_tokens=64) + batched_generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + text = f"{dataset[40]['query']['en']}" + images = dataset[40]["image"] + inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt") + generated_ids = model.generate(**inputs, max_new_tokens=64) + generated_text_0 = self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + text = f"{dataset[41]['query']['en']}" + images = dataset[41]["image"] + inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt") + generated_ids = model.generate(**inputs, max_new_tokens=64) + generated_text_1 = self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + self.assertEqual(batched_generated_texts[0], generated_text_0[0]) + self.assertEqual(batched_generated_texts[1], generated_text_1[0]) + @require_flash_attn + @require_torch_gpu + @require_bitsandbytes