huggingface
diff --git a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
Lines changed: 7 additions & 1 deletion
diff --git a/tests/models/ernie4_5/test_modeling_ernie4_5.py b/tests/models/ernie4_5/test_modeling_ernie4_5.py
Lines changed: 1 addition & 0 deletions
diff --git a/tests/models/internvl/test_modeling_internvl.py b/tests/models/internvl/test_modeling_internvl.py
Lines changed: 4 additions & 3 deletions
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
Lines changed: 37 additions & 6 deletions
@@ -223,7 +223,7 @@ def test_model_integration_forward(self):
 
         EXPECTED_LOGITS = Expectations(
             {
-                ("xpu", 3): [0.4109, 0.1532, 0.8018, 2.1328, 0.5483],
+                ("xpu", 3): [2.4297, 1.6836, 1.8779, 2.1895, 1.9395],
                 # 4-bit
                 ("cuda", 7): [0.1097, 0.3481, 3.8340, 9.7969, 2.0488],
                 ("cuda", 8): [2.4277, 1.6875, 1.8789, 2.1875, 1.9375],
@@ -264,6 +264,7 @@ def test_model_integration_generate_text_only(self):
 
         expected_outputs = Expectations(
             {
+                ("xpu", 3): "<|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|>",
                 ("cuda", 8): "<|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|>",
             }
         )  # fmt: skip
@@ -298,6 +299,7 @@ def test_model_integration_generate_chat_template(self):
 
         expected_outputs = Expectations(
             {
+                ("xpu", 3): '<|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|>',
                 ("cuda", 8): '<|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|>',
             }
         )  # fmt: skip
@@ -344,6 +346,7 @@ def test_model_integration_batched_generate(self):
         decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
         expected_outputs = Expectations(
             {
+                ("xpu", 3): 'Dock stretches to calm',
                 ("cuda", 8): 'Dock stretches to calm',
             }
         )  # fmt: skip
@@ -360,6 +363,7 @@ def test_model_integration_batched_generate(self):
 
         expected_outputs = Expectations(
             {
+                ("xpu", 3): 'The image depicts a',
                 ("cuda", 8): 'The image depicts a',
             }
         )  # fmt: skip
@@ -418,6 +422,7 @@ def test_model_integration_batched_generate_multi_image(self):
         # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
         expected_outputs = Expectations(
             {
+                ("xpu", 3): '<|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|>',
                 ("cuda", 8): '<|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|>',
             }
         )  # fmt: skip
@@ -433,6 +438,7 @@ def test_model_integration_batched_generate_multi_image(self):
         decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
         expected_outputs = Expectations(
             {
+                ("xpu", 3): '<|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|>',
                 ("cuda", 8): '<|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|><|CHATBOT_TOKEN|>',
             }
         )  # fmt: skip
 
@@ -79,6 +79,7 @@ def test_ernie4_5_0p3B(self):
         """
         expected_texts = Expectations(
             {
+                ("xpu", 3): "User: Hey, are you conscious? Can you talk to me?\nAssistant: Hey! I'm here to help you with whatever you need. Are you feeling a bit overwhelmed or stressed? I'm here to listen and provide support.",
                 ("cuda", None): "User: Hey, are you conscious? Can you talk to me?\nAssistant: Hey! I'm here to help you with whatever you need. Are you feeling a bit overwhelmed or stressed? I'm here to listen and provide support.",
             }
         )  # fmt: skip
 
@@ -645,7 +645,7 @@ def test_llama_small_model_integration_forward(self):
 
         expected_logits_all = Expectations(
             {
-                ("xpu", 3): [-9.8750, -0.5703, 1.4297, -10.3125, -10.3125],
+                ("xpu", 3): [-9.8828,  -0.4954,   1.4561, -10.3438, -10.3438],
                 ("cuda", 7): [-9.8750,  -0.4861,   1.4648, -10.3359, -10.3359],
                 ("cuda", 8): [-9.8906,  -0.4995,   1.4473, -10.3359, -10.3438],
                 ("rocm", (9, 4)): [ -9.8828,  -0.5005,   1.4697, -10.3438, -10.3438],
@@ -680,6 +680,7 @@ def test_llama_small_model_integration_generate_text_only(self):
 
         expected_outputs = Expectations(
             {
+                ("xpu", 3): "Autumn leaves fall,\nNature's breath, a season's sigh,\nSilent woods awake.",
                 ("cuda", 7): "Autumn leaves fall,\nNature's breath, a gentle sigh,\nSilent whispers.",
                 ("cuda", 8): "Autumn leaves fall,\nNature's breath, a silent sigh,\nWinter's chill approaches.",
             }
@@ -920,7 +921,7 @@ def test_llama_small_model_integration_interleaved_images_videos(self):
         # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
         expected_outputs = Expectations(
             {
-                ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
+                ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **",
                 ("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
                 ("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
                 ("rocm", (9, 4)): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
@@ -938,7 +939,7 @@ def test_llama_small_model_integration_interleaved_images_videos(self):
         decoded_output = processor.decode(output[1], skip_special_tokens=True)
         expected_outputs = Expectations(
             {
-                ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
+                ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their",
                 ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
                 ("cuda", 8): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
             }
 
@@ -300,6 +300,7 @@ def test_small_model_integration_test(self):
 
         output = model.generate(**inputs, max_new_tokens=20)
         expected_decoded_texts = Expectations({
+            ("xpu", 3): "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly,",
             ("cuda", None): "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly,",
             ("rocm", (9, 5)): "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. First, the",
         })  # fmt: skip
@@ -328,17 +329,16 @@ def test_small_model_integration_test_llama_single(self):
 
         EXPECTED_DECODED_TEXTS = Expectations(
             {
+                ("xpu", 3): 'USER:  \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Lastly, be respectful of the environment and other visitors, as the pier is a shared space where people can enjoy the view, relax, or engage in recreational activities.',
                 ("cuda", 7): 'USER:  \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Lastly, be respectful of the environment and other visitors, as the pier is a shared space where people can enjoy the view, relax, or engage in recreational activities.',
                 ("cuda", 8): 'USER:  \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Lastly, be respectful of the environment and other visitors, as the pier is a shared space where people can enjoy the view, relax, or engage in recreational activities.',
                 ("rocm", (9, 5)): 'USER:  \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock overlooking a lake, you should be cautious about the following:\n\n1. Safety: Ensure that the pier or dock is stable and secure before stepping onto it. Avoid walking on the edge of the pier or dock, as it could be unstable or unsafe.\n\n2. Weather conditions: Be aware of the weather forecast before visiting the area. Strong winds, heavy rain, or storms can make the pier or dock unsafe to use.\n\n3. Wildlife: Be mindful of the wildlife in the area, such as birds or aquatic animals. Avoid disturbing their natural habitat or causing harm to the local ecosystem.\n\n4. Water safety: If you plan to go swimming or engage in water activities, be aware of the water conditions, such as currents, tides, or potential hazards like submerged objects.\n\n5. Personal belongings: Keep an eye on your personal belongings, such as bags or backpacks, to prevent theft or loss.\n\n6. Leave no trace: When visiting the area, make sure to clean up after yourself and leave no trace of your presence to preserve the natural environment.',
             }
         )  # fmt: skip
         EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()
+        decoded_text = processor.decode(output[0], skip_special_tokens=True)
 
-        self.assertEqual(
-            processor.decode(output[0], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
+        self.assertEqual(decoded_text, EXPECTED_DECODED_TEXT)
 
     @slow
     @require_bitsandbytes
@@ -362,6 +362,13 @@ def test_small_model_integration_test_llama_batched(self):
 
         expected_decoded_texts = Expectations(
             {
+                ("xpu", 3): [
+                    "USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring "
+                    "with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, "
+                    "you",
+                    "USER:  \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat "
+                    "is located on",
+                ],
                 ("cuda", None): [
                     "USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring "
                     "with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, "
@@ -404,6 +411,10 @@ def test_small_model_integration_test_batch(self):
 
         EXPECTED_DECODED_TEXTS = Expectations(
             {
+                ("xpu", 3): [
+                    'USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring along',
+                    'USER:  \nWhat is this?\nASSISTANT: Cats',
+                ],
                 ("cuda", 7): [
                     'USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring along',
                     'USER:  \nWhat is this?\nASSISTANT: Cats',
@@ -452,6 +463,13 @@ def test_small_model_integration_test_llama_batched_regression(self):
 
         expected_decoded_texts = Expectations(
             {
+                ("xpu", 3): [
+                    "USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring "
+                    "with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a "
+                    "body of water",
+                    "USER:  \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER:  \nAnd this?\nASSISTANT: A cat "
+                    "sleeping on a bed.",
+                ],
                 ("cuda", None): [
                     "USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring "
                     "with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a "
@@ -501,6 +519,11 @@ def test_batched_generation(self):
 
         EXPECTED_OUTPUTS = Expectations(
             {
+                ("xpu", 3): [
+                    "\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
+                    '\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small',
+                    '\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the'
+                ],
                 ("cuda", 7): [
                     "\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one of them has a dog standing on a field, while",
                     "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
@@ -573,8 +596,16 @@ def test_generation_siglip_backbone(self):
         # Make sure that `generate` works
         output = model.generate(**inputs, max_new_tokens=30)
 
-        EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat"
-        self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT)
+        EXPECTED_DECODED_TEXTS = Expectations(
+            {
+                ("xpu", 3): "user\n\nWhat are these?\nassistant These are two cats, one with a green collar and the other with a black collar. They are lying on a pink blanket and appear to be sleeping",
+                ("cuda", None): "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat",
+            }
+        )  # fmt: skip
+        EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()
+
+        decoded_text = processor.batch_decode(output, skip_special_tokens=True)[0]
+        self.assertEqual(decoded_text, EXPECTED_DECODED_TEXT)
 
     @slow
     def test_pixtral(self):
Original file line number	Diff line number	Diff line change
`@@ -223,7 +223,7 @@ def test_model_integration_forward(self):`
`223`	`223`
`224`	`224`	`EXPECTED_LOGITS = Expectations(`
`225`	`225`	`{`
`226`		`- ("xpu", 3): [0.4109, 0.1532, 0.8018, 2.1328, 0.5483],`
	`226`	`+ ("xpu", 3): [2.4297, 1.6836, 1.8779, 2.1895, 1.9395],`
`227`	`227`	`# 4-bit`
`228`	`228`	`("cuda", 7): [0.1097, 0.3481, 3.8340, 9.7969, 2.0488],`
`229`	`229`	`("cuda", 8): [2.4277, 1.6875, 1.8789, 2.1875, 1.9375],`
`@@ -264,6 +264,7 @@ def test_model_integration_generate_text_only(self):`
`264`	`264`
`265`	`265`	`expected_outputs = Expectations(`
`266`	`266`	`{`
	`267`	`+ ("xpu", 3): "<\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|>",`
`267`	`268`	`("cuda", 8): "<\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|>",`
`268`	`269`	`}`
`269`	`270`	`) # fmt: skip`
`@@ -298,6 +299,7 @@ def test_model_integration_generate_chat_template(self):`
`298`	`299`
`299`	`300`	`expected_outputs = Expectations(`
`300`	`301`	`{`
	`302`	`+ ("xpu", 3): '<\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|>',`
`301`	`303`	`("cuda", 8): '<\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|>',`
`302`	`304`	`}`
`303`	`305`	`) # fmt: skip`
`@@ -344,6 +346,7 @@ def test_model_integration_batched_generate(self):`
`344`	`346`	`decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)`
`345`	`347`	`expected_outputs = Expectations(`
`346`	`348`	`{`
	`349`	`+ ("xpu", 3): 'Dock stretches to calm',`
`347`	`350`	`("cuda", 8): 'Dock stretches to calm',`
`348`	`351`	`}`
`349`	`352`	`) # fmt: skip`
`@@ -360,6 +363,7 @@ def test_model_integration_batched_generate(self):`
`360`	`363`
`361`	`364`	`expected_outputs = Expectations(`
`362`	`365`	`{`
	`366`	`+ ("xpu", 3): 'The image depicts a',`
`363`	`367`	`("cuda", 8): 'The image depicts a',`
`364`	`368`	`}`
`365`	`369`	`) # fmt: skip`
`@@ -418,6 +422,7 @@ def test_model_integration_batched_generate_multi_image(self):`
`418`	`422`	`# Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232`
`419`	`423`	`expected_outputs = Expectations(`
`420`	`424`	`{`
	`425`	`+ ("xpu", 3): '<\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|>',`
`421`	`426`	`("cuda", 8): '<\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|>',`
`422`	`427`	`}`
`423`	`428`	`) # fmt: skip`
`@@ -433,6 +438,7 @@ def test_model_integration_batched_generate_multi_image(self):`
`433`	`438`	`decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)`
`434`	`439`	`expected_outputs = Expectations(`
`435`	`440`	`{`
	`441`	`+ ("xpu", 3): '<\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|>',`
`436`	`442`	`("cuda", 8): '<\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|><\|CHATBOT_TOKEN\|>',`
`437`	`443`	`}`
`438`	`444`	`) # fmt: skip`
Original file line number	Diff line number	Diff line change
`@@ -79,6 +79,7 @@ def test_ernie4_5_0p3B(self):`
`79`	`79`	`"""`
`80`	`80`	`expected_texts = Expectations(`
`81`	`81`	`{`
	`82`	`+ ("xpu", 3): "User: Hey, are you conscious? Can you talk to me?\nAssistant: Hey! I'm here to help you with whatever you need. Are you feeling a bit overwhelmed or stressed? I'm here to listen and provide support.",`
`82`	`83`	`("cuda", None): "User: Hey, are you conscious? Can you talk to me?\nAssistant: Hey! I'm here to help you with whatever you need. Are you feeling a bit overwhelmed or stressed? I'm here to listen and provide support.",`
`83`	`84`	`}`
`84`	`85`	`) # fmt: skip`
Original file line number	Diff line number	Diff line change
`@@ -645,7 +645,7 @@ def test_llama_small_model_integration_forward(self):`
`645`	`645`
`646`	`646`	`expected_logits_all = Expectations(`
`647`	`647`	`{`
`648`		`- ("xpu", 3): [-9.8750, -0.5703, 1.4297, -10.3125, -10.3125],`
	`648`	`+ ("xpu", 3): [-9.8828, -0.4954, 1.4561, -10.3438, -10.3438],`
`649`	`649`	`("cuda", 7): [-9.8750, -0.4861, 1.4648, -10.3359, -10.3359],`
`650`	`650`	`("cuda", 8): [-9.8906, -0.4995, 1.4473, -10.3359, -10.3438],`
`651`	`651`	`("rocm", (9, 4)): [ -9.8828, -0.5005, 1.4697, -10.3438, -10.3438],`
`@@ -680,6 +680,7 @@ def test_llama_small_model_integration_generate_text_only(self):`
`680`	`680`
`681`	`681`	`expected_outputs = Expectations(`
`682`	`682`	`{`
	`683`	`+ ("xpu", 3): "Autumn leaves fall,\nNature's breath, a season's sigh,\nSilent woods awake.",`
`683`	`684`	`("cuda", 7): "Autumn leaves fall,\nNature's breath, a gentle sigh,\nSilent whispers.",`
`684`	`685`	`("cuda", 8): "Autumn leaves fall,\nNature's breath, a silent sigh,\nWinter's chill approaches.",`
`685`	`686`	`}`
`@@ -920,7 +921,7 @@ def test_llama_small_model_integration_interleaved_images_videos(self):`
`920`	`921`	`# Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232`
`921`	`922`	`expected_outputs = Expectations(`
`922`	`923`	`{`
`923`		`- ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",`
	`924`	`+ ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **",`
`924`	`925`	`("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',`
`925`	`926`	`("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',`
`926`	`927`	`("rocm", (9, 4)): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',`
`@@ -938,7 +939,7 @@ def test_llama_small_model_integration_interleaved_images_videos(self):`
`938`	`939`	`decoded_output = processor.decode(output[1], skip_special_tokens=True)`
`939`	`940`	`expected_outputs = Expectations(`
`940`	`941`	`{`
`941`		`- ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",`
	`942`	`+ ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their",`
`942`	`943`	`("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',`
`943`	`944`	`("cuda", 8): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',`
`944`	`945`	`}`