Support embedding models in V1 #16188

Merged
merged 98 commits on Jun 19, 2025
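As context for the changes below, here is a minimal offline sketch of what this PR enables: running an embedding (pooling) model on the V1 engine. The model name and engine flags mirror the example defaults touched in this PR; the output field access is an assumption about the current embed API, not something this PR defines.

import os

os.environ["VLLM_USE_V1"] = "1"  # opt into the V1 engine before importing vllm

from vllm import LLM

llm = LLM(
    model="intfloat/e5-mistral-7b-instruct",  # model from the example diff below
    task="embed",
    enforce_eager=True,
    max_model_len=1024,
)
outputs = llm.embed(["vLLM is a fast inference engine."])
print(len(outputs[0].outputs.embedding))  # embedding dimensionality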
Commits (98)
f36c4f9
Remove guardrails that prevent V1 from trying to run embedding models
maxdebayser Mar 24, 2025
acf4638
hack v1 flash_attn to support encoder_only
maxdebayser Apr 3, 2025
b13bbc0
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 3, 2025
8debea0
Revert changes to disable kv caching for encoder-only models
maxdebayser Apr 3, 2025
8d97b9c
Add pooling support in v1
maxdebayser Apr 5, 2025
d60b22b
First end-to-end working version of Bert embeddings in V1
maxdebayser Apr 7, 2025
6bebbb8
Support warmup for pooling models in V1
maxdebayser Apr 7, 2025
6dafd71
address review comments
maxdebayser Apr 7, 2025
e2724a2
address review comments
maxdebayser Apr 7, 2025
56ff6cd
remove debug prints
maxdebayser Apr 7, 2025
fc57edd
address review comments
maxdebayser Apr 7, 2025
64a0e62
Fix cross encoder models in V1 and enable tests for pooling models
maxdebayser Apr 8, 2025
4014d41
address review comments
maxdebayser Apr 8, 2025
87a95a8
Merge branch 'main' into v1_embeddings
maxdebayser Apr 8, 2025
902c129
address review comments
maxdebayser Apr 8, 2025
2c68855
re-enable large embedding models
maxdebayser Apr 8, 2025
8afd8f5
address review comments
maxdebayser Apr 8, 2025
7762976
Merge branch 'main' into v1_embeddings
maxdebayser Apr 8, 2025
d7537ae
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 8, 2025
a9e7747
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 9, 2025
17520bd
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 14, 2025
90c611a
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 15, 2025
dec2441
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 17, 2025
a5e83f4
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 23, 2025
187f69b
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 24, 2025
69a0332
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 29, 2025
a9f1721
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 29, 2025
4b066a3
fix merge problems
maxdebayser Apr 30, 2025
43a26dc
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 30, 2025
ca34513
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Apr 30, 2025
bf3033d
Fix missing qwen embedding model param
maxdebayser Apr 30, 2025
67bf727
Make pooling params reach the pooling in V1
maxdebayser May 1, 2025
93b6361
Merge branch 'upstream_main' into v1_embeddings
maxdebayser May 1, 2025
d916b88
Merge branch 'upstream_main' into v1_embeddings
maxdebayser May 10, 2025
bad4211
fix merge problems
maxdebayser May 10, 2025
35d9bd9
Merge branch 'upstream_main' into v1_embeddings
maxdebayser May 11, 2025
dcc6100
Merge branch 'upstream_main' into v1_embeddings
maxdebayser May 12, 2025
a4f85b5
Merge branch 'upstream_main' into v1_embeddings
maxdebayser May 13, 2025
a5f328a
Merge branch 'upstream_main' into v1_embeddings
maxdebayser May 15, 2025
7c5be88
fix merge problem
maxdebayser May 15, 2025
29b75c9
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 4, 2025
6aa204c
backport changes from the other PR
maxdebayser Jun 4, 2025
e81470c
fix merge errors
maxdebayser Jun 4, 2025
20e7140
address review comments
maxdebayser Jun 4, 2025
6bc1e3d
address review comments
maxdebayser Jun 4, 2025
22825bd
simplify PR
maxdebayser Jun 4, 2025
c889b2e
fix mistake
maxdebayser Jun 4, 2025
24462e4
workaround qwen model test issue
maxdebayser Jun 6, 2025
b5f21f2
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 6, 2025
79d1b95
revert unecessary change
maxdebayser Jun 6, 2025
b3a0491
remove duplicated code
maxdebayser Jun 6, 2025
b4ab556
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 6, 2025
1a82e56
remove encoder model support to simplify PR
maxdebayser Jun 7, 2025
a66801b
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 9, 2025
660dd9c
fix several tests
maxdebayser Jun 9, 2025
808c996
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 9, 2025
cdd70c9
Fix test
maxdebayser Jun 9, 2025
0832115
disable bert test
maxdebayser Jun 9, 2025
10bbf74
fix tests
maxdebayser Jun 9, 2025
ee892aa
limit context length to fit test GPU
maxdebayser Jun 9, 2025
2e12eba
limit context length to fit test GPU
maxdebayser Jun 9, 2025
14fcf24
fix test
maxdebayser Jun 10, 2025
0624435
fix test
maxdebayser Jun 10, 2025
706fdb2
Merge branch 'main' into v1_embeddings
22quinn Jun 10, 2025
051f6d4
Fix _construct_cached_request_state
22quinn Jun 10, 2025
214cf06
Fix v1 tests
22quinn Jun 10, 2025
8193bd0
Merge pull request #1 from 22quinn/v1_embeddings
maxdebayser Jun 10, 2025
65b8377
fix test
maxdebayser Jun 10, 2025
33d7f74
Merge branch 'v1_embeddings' of github.com:maxdebayser/vllm into v1_e…
maxdebayser Jun 10, 2025
4ee822a
reduce max_model_len to fit in test gpu
maxdebayser Jun 10, 2025
7242731
fix test
maxdebayser Jun 10, 2025
a4f460b
fix test
maxdebayser Jun 10, 2025
35ca640
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 12, 2025
17f6177
fix test
maxdebayser Jun 12, 2025
3f0d42e
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 12, 2025
74d73cc
use torch.split
maxdebayser Jun 12, 2025
e6a66dc
enable cuda graphs
maxdebayser Jun 12, 2025
4cca774
fix unecessary config.py changes
maxdebayser Jun 12, 2025
8ef1982
fix error message
maxdebayser Jun 12, 2025
28d00d1
remove unused import
maxdebayser Jun 12, 2025
e634f60
fix docstring
maxdebayser Jun 12, 2025
053475c
revert unnecessary code changes
maxdebayser Jun 12, 2025
6228f64
remove debug prints
maxdebayser Jun 12, 2025
42c802a
fix refactoring bug
maxdebayser Jun 12, 2025
f771a19
fix refactoring bug
maxdebayser Jun 12, 2025
02c47ad
Fix default chunked prefill for pooling models
maxdebayser Jun 13, 2025
1fd252c
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 13, 2025
c5c0d97
Revert handling of case that can never happen
maxdebayser Jun 13, 2025
acfc9cc
fix small bug
maxdebayser Jun 13, 2025
225b808
fix small bugs
maxdebayser Jun 13, 2025
2b86c13
fix silly mistake
maxdebayser Jun 13, 2025
2983252
reduce memory usage for small ci gpus
maxdebayser Jun 13, 2025
58c556d
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 13, 2025
878d56a
enable chunked prefill by default for models that support it
maxdebayser Jun 14, 2025
2db273f
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 14, 2025
114af27
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 16, 2025
bc0219d
address review comments
maxdebayser Jun 16, 2025
221f013
Merge branch 'upstream_main' into v1_embeddings
maxdebayser Jun 19, 2025
5 changes: 4 additions & 1 deletion examples/offline_inference/basic/embed.py
@@ -12,7 +12,10 @@ def parse_args():
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="intfloat/e5-mistral-7b-instruct", task="embed", enforce_eager=True
model="intfloat/e5-mistral-7b-instruct",
task="embed",
enforce_eager=True,
max_model_len=1024,
)
return parser.parse_args()

1 change: 1 addition & 0 deletions examples/offline_inference/vision_language_embedding.py
@@ -94,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
task="embed",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
32 changes: 18 additions & 14 deletions tests/compile/test_basic_correctness.py
@@ -31,7 +31,7 @@ class TestSetting:
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
model_args=["--max-model-len", "2048"],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
@@ -41,7 +41,7 @@ class TestSetting:
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
model_args=["--quantization", "gptq", "--max-model-len", "2048"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
@@ -51,7 +51,7 @@ class TestSetting:
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
model_args=["--max-model-len", "2048"],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
@@ -61,23 +61,27 @@ class TestSetting:
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed", "--dtype", "bfloat16"],
model_args=[
"--task", "embed", "--dtype", "bfloat16", "--max-model-len",
"2048"
],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# TODO: bert models are not supported in V1 yet
# # encoder-based embedding model (BERT)
# TestSetting(
# model="BAAI/bge-base-en-v1.5",
# model_args=["--task", "embed"],
# pp_size=1,
# tp_size=1,
# attn_backend="XFORMERS",
# method="encode",
# fullgraph=True,
# ),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
3 changes: 3 additions & 0 deletions tests/conftest.py
@@ -145,13 +145,16 @@ def run_with_both_engines(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1 = request.param
# Tests decorated with `@skip_v1` are only run without v1
skip_v0 = request.node.get_closest_marker("skip_v0")
skip_v1 = request.node.get_closest_marker("skip_v1")

if use_v1:
if skip_v1:
pytest.skip("Skipping test on vllm V1")
monkeypatch.setenv('VLLM_USE_V1', '1')
else:
if skip_v0:
pytest.skip("Skipping test on vllm V0")
monkeypatch.setenv('VLLM_USE_V1', '0')

yield
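A small sketch of how a test module is expected to opt into this fixture together with the new skip_v0 marker; the test names are illustrative only, while the fixture pattern itself is the one added to the entrypoint tests below.

import pytest


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Runs every test in the module twice: once with VLLM_USE_V1=1, once with 0.
    pass


@pytest.mark.skip_v1
def test_v0_only_behavior():
    ...  # skipped on the V1 pass


@pytest.mark.skip_v0
def test_v1_only_behavior():
    ...  # skipped on the V0 pass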
24 changes: 20 additions & 4 deletions tests/entrypoints/llm/test_encode.py
@@ -8,6 +8,8 @@
from vllm import LLM, PoolingParams, PoolingRequestOutput
from vllm.distributed import cleanup_dist_env_and_memory

from ...models.utils import check_embeddings_close

MODEL_NAME = "intfloat/multilingual-e5-small"

PROMPTS = [
@@ -27,6 +29,14 @@
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
@@ -46,9 +56,15 @@ def llm():
cleanup_dist_env_and_memory()


def assert_outputs_equal(o1: list[PoolingRequestOutput],
def assert_outputs_match(o1: list[PoolingRequestOutput],
o2: list[PoolingRequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2]
check_embeddings_close(
embeddings_0_lst=[o.outputs.data for o in o1],
embeddings_1_lst=[o.outputs.data for o in o2],
name_0="hf",
name_1="vllm",
tol=1e-2,
)


@pytest.mark.skip_global_cleanup
@@ -63,7 +79,7 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,

v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
pooling_params=pooling_params)
assert_outputs_equal(v1_output, v2_output)
assert_outputs_match(v1_output, v2_output)


@pytest.mark.skip_global_cleanup
@@ -80,7 +96,7 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
} for p in TOKEN_IDS],
pooling_params=pooling_params,
)
assert_outputs_equal(v1_output, v2_output)
assert_outputs_match(v1_output, v2_output)


@pytest.mark.skip_global_cleanup
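The switch from exact equality to check_embeddings_close with tol=1e-2 is presumably because V0 and V1 produce numerically close but not bit-identical embeddings. A rough sketch of the idea (not the actual helper in tests/models/utils.py), assuming a cosine-distance tolerance:

import math


def embeddings_close(a: list[float], b: list[float], tol: float = 1e-2) -> bool:
    # Compare by cosine distance instead of exact element-wise equality.
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b))
    return 1.0 - dot / norm <= tol


assert embeddings_close([0.1, 0.2, 0.3], [0.1001, 0.2001, 0.2999])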
8 changes: 8 additions & 0 deletions tests/entrypoints/openai/test_embedding.py
@@ -21,6 +21,14 @@
DTYPE = "bfloat16"


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.fixture(scope="module")
def server():
args = [
15 changes: 11 additions & 4 deletions tests/entrypoints/openai/test_pooling.py
@@ -7,6 +7,7 @@
import pytest
import requests

from tests.models.utils import check_embeddings_close
from vllm.entrypoints.openai.protocol import PoolingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer

@@ -223,8 +224,11 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
np.frombuffer(base64.b64decode(data.data),
dtype="float32").tolist())

assert responses_float.data[0].data == decoded_responses_base64_data[0]
assert responses_float.data[1].data == decoded_responses_base64_data[1]
check_embeddings_close(
embeddings_0_lst=[d.data for d in responses_float.data],
embeddings_1_lst=decoded_responses_base64_data,
name_0="float32",
name_1="base64")

# Default response is float32 decoded from base64 by OpenAI Client
default_response = requests.post(
Expand All @@ -237,5 +241,8 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
default_response.raise_for_status()
responses_default = PoolingResponse.model_validate(default_response.json())

assert responses_float.data[0].data == responses_default.data[0].data
assert responses_float.data[1].data == responses_default.data[1].data
check_embeddings_close(
embeddings_0_lst=[d.data for d in responses_float.data],
embeddings_1_lst=[d.data for d in responses_default.data],
name_0="float32",
name_1="default")
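For reference, a client-side sketch of the base64 pooling path exercised above. The decoding follows the test; the endpoint path, payload fields, server address, and model name are assumptions based on the embeddings-style API rather than code from this PR.

import base64

import numpy as np
import requests

resp = requests.post(
    "http://localhost:8000/pooling",  # placeholder server address
    json={
        "model": "intfloat/multilingual-e5-small",  # placeholder model
        "input": ["hello world"],
        "encoding_format": "base64",
    },
)
resp.raise_for_status()
for item in resp.json()["data"]:
    # Pooled outputs arrive base64-encoded; decode back to float32 vectors.
    vec = np.frombuffer(base64.b64decode(item["data"]), dtype="float32")
    print(vec.shape)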
8 changes: 8 additions & 0 deletions tests/entrypoints/openai/test_rerank.py
@@ -12,6 +12,14 @@
DTYPE = "bfloat16"


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
9 changes: 9 additions & 0 deletions tests/entrypoints/openai/test_score.py
@@ -11,6 +11,15 @@

from ...utils import RemoteOpenAIServer


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


MODELS = [
{
"name": "BAAI/bge-reranker-v2-m3",
10 changes: 9 additions & 1 deletion tests/models/language/pooling/test_classification.py
@@ -6,6 +6,14 @@

from vllm.platforms import current_platform

# TODO: enable when float32 is supported by V1
# @pytest.fixture(autouse=True)
# def v1(run_with_both_engines):
# # Simple autouse wrapper to run both engines for each test
# # This can be promoted up to conftest.py to run for every
# # test in a package
# pass


@pytest.mark.parametrize(
"model",
@@ -29,7 +37,7 @@ def test_models(
# switch to use ROCm CK FA backend
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)

with hf_runner(model,
34 changes: 27 additions & 7 deletions tests/models/language/pooling/test_embedding.py
@@ -8,6 +8,14 @@
from ...utils import check_embeddings_close


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.mark.parametrize(
"model",
[
@@ -20,15 +28,27 @@
marks=[pytest.mark.core_model]),
pytest.param("intfloat/e5-mistral-7b-instruct",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
# the qwen models interfere with each other (see PR
# https://github.com/vllm-project/vllm/pull/18720).
# To avoid this problem, for now we skip v0 since it will be
# deprecated anyway.
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.skip_v0]),
# [Encoder-only]
pytest.param("BAAI/bge-base-en-v1.5",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.skip_v1
]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("intfloat/multilingual-e5-small",
marks=[pytest.mark.skip_v1]),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]),
# [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
pytest.param("sentence-transformers/stsb-roberta-base-v2",
marks=[pytest.mark.skip_v1]),
],
)
def test_models(
@@ -62,7 +82,7 @@ def test_models(

with vllm_runner(model,
task="embed",
max_model_len=None,
max_model_len=512,
**vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)

22 changes: 11 additions & 11 deletions tests/models/registry.py
@@ -265,8 +265,8 @@ def check_available_online(

_EMBEDDING_EXAMPLE_MODELS = {
# [Text-only]
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True),
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True), # noqa: E501
"GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
"GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
trust_remote_code=True),
@@ -279,16 +279,16 @@ def check_available_online(
"LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
"MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
"ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
trust_remote_code=True),
trust_remote_code=True, v0_only=True),
"NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
trust_remote_code=True),
trust_remote_code=True, v0_only=True), # noqa: E501
"Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
"Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
"Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"),
"Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501
"RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"),
"RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501
# [Multimodal]
"LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
"Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
@@ -300,10 +300,10 @@ def check_available_online(

_CROSS_ENCODER_EXAMPLE_MODELS = {
# [Text-only]
"BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"), # noqa: E501
"RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base"), # noqa: E501
"XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3"), # noqa: E501
"ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base"), # noqa: E501
"BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True), # noqa: E501
"RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True), # noqa: E501
"XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True), # noqa: E501
"ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501
}

_MULTIMODAL_EXAMPLE_MODELS = {
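The new v0_only flag marks example models that should, for now, only be exercised on the V0 engine. A hypothetical illustration of the intended gating; the helper name and mechanism are assumptions, not code from this PR.

import os

import pytest


def skip_if_v1_unsupported(info) -> None:
    # Hypothetical helper: skip the current test when a v0_only example model
    # would otherwise run under the V1 engine.
    if getattr(info, "v0_only", False) and os.environ.get("VLLM_USE_V1") == "1":
        pytest.skip("model is not supported by the V1 engine yet")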
1 change: 1 addition & 0 deletions tests/tokenization/test_detokenize.py
@@ -68,6 +68,7 @@ def _run_incremental_decode(tokenizer,
None,
params,
None,
None,
0.0,
None,
cache_salt=None,
1 change: 1 addition & 0 deletions tests/v1/core/test_kv_cache_utils.py
@@ -43,6 +43,7 @@ def make_request(request_id,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
sampling_params=SamplingParams(max_tokens=17),
pooling_params=None,
eos_token_id=100,
lora_request=None,
cache_salt=cache_salt,
1 change: 1 addition & 0 deletions tests/v1/core/test_prefix_caching.py
@@ -39,6 +39,7 @@ def make_request(request_id,
multi_modal_placeholders=mm_positions,
sampling_params=SamplingParams(max_tokens=17,
prompt_logprobs=prompt_logprobs),
pooling_params=None,
eos_token_id=100,
lora_request=None,
cache_salt=cache_salt,