vllm-project · ywang96 · Jun 21, 2024 · Jun 20, 2024
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -233,11 +233,13 @@ def generate_greedy(
         prompts: List[str],
         max_tokens: int,
         images: Optional[List[Image.Image]] = None,
+        **kwargs,
     ) -> List[Tuple[List[int], str]]:
         outputs = self.generate(prompts,
                                 do_sample=False,
                                 max_new_tokens=max_tokens,
-                                images=images)
+                                images=images,
+                                **kwargs)
 
         return [(output_ids[0], output_str[0])
                 for output_ids, output_str in outputs]

@@ -77,7 +77,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 # numeric difference for longer context and test can't pass
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                 model_and_config, dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.
@@ -95,9 +95,11 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     hf_model_kwargs = {"_attn_implementation": "eager"}
     with hf_runner(model_id, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                              max_tokens,
-                                              images=hf_images)
+        hf_outputs = hf_model.generate_greedy(
+            HF_IMAGE_PROMPTS,
+            max_tokens,
+            images=hf_images,
+            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
 
     vllm_image_prompts = [
         p.replace("<|image_1|>",