Add support for encoder embedding models using MultiModal args #20026

Draft: wants to merge 3 commits into base: main
16 changes: 5 additions & 11 deletions tests/models/language/pooling/test_embedding.py
@@ -38,19 +38,13 @@ def v1(run_with_both_engines):
                  marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
     # [Encoder-only]
     pytest.param("BAAI/bge-base-en-v1.5",
-                 marks=[
-                     pytest.mark.core_model, pytest.mark.cpu_model,
-                     pytest.mark.skip_v1
-                 ]),
-    pytest.param("sentence-transformers/all-MiniLM-L12-v2",
-                 marks=[pytest.mark.skip_v1]),
-    pytest.param("intfloat/multilingual-e5-small",
-                 marks=[pytest.mark.skip_v1]),
+                 marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+    pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
+    pytest.param("intfloat/multilingual-e5-small"),
     pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-                 marks=[pytest.mark.skip_v1]),
+                 marks=[pytest.mark.skip_v0]),
     # [Cross-Encoder]
-    pytest.param("sentence-transformers/stsb-roberta-base-v2",
-                 marks=[pytest.mark.skip_v1]),
+    pytest.param("sentence-transformers/stsb-roberta-base-v2"),
 ],
 )
 def test_models(
8 changes: 8 additions & 0 deletions tests/models/language/pooling/test_jina.py
@@ -26,6 +26,14 @@
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
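
The fixture's comment points at promoting this wrapper into conftest.py. A minimal sketch of what that promotion could look like (the file location is hypothetical, and run_with_both_engines is assumed to parametrize the engine selection the same way it does in these test modules):

    # tests/models/language/pooling/conftest.py (hypothetical placement)
    import pytest

    @pytest.fixture(autouse=True)
    def v1(run_with_both_engines):
        # Autouse at package level: every test in the package now runs
        # twice, once per engine, via the run_with_both_engines fixture.
        pass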
9 changes: 9 additions & 0 deletions tests/models/language/pooling/test_scoring.py
@@ -23,6 +23,15 @@
"The capital of Germany is Berlin.",
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


DTYPE = "half"


Expand Down
2 changes: 1 addition & 1 deletion tests/v1/core/test_kv_cache_utils.py
@@ -916,4 +916,4 @@ def test_get_kv_cache_config():
         ],
         kv_cache_groups=[
             KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())
-        ])
\ No newline at end of file
+        ])
5 changes: 5 additions & 0 deletions vllm/config.py
@@ -716,6 +716,11 @@ def _init_pooler_config(self) -> Optional["PoolerConfig"]:
                 self.override_pooler_config = PoolerConfig(
                     **self.override_pooler_config)
 
+            # WIP: currently cuda graphs are not working for encoder models.
+            logger.warning("CUDA graph is not supported for pooling yet, "
+                           "fallback to the eager mode.")
+            self.enforce_eager = True
+
Comment on lines +720 to +722
Contributor

Severity: medium

This warning message and the subsequent enforce_eager setting seem specific to the pooling functionality. Consider encapsulating this logic within the if self.runner_type == "pooling" block to avoid unintended side effects for other model types.

            if isinstance(self.override_pooler_config, dict):
                self.override_pooler_config = PoolerConfig(
                    **self.override_pooler_config)

            # WIP: currently cuda graphs are not working for encoder models.
            logger.warning("CUDA graph is not supported for pooling yet, "
                           "fallback to the eager mode.")
            self.enforce_eager = True


             pooler_config = self.override_pooler_config or PoolerConfig()
 
             base_config = get_pooling_config(self.model, self.revision)
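
To make the reviewer's suggestion concrete, here is a sketch of the encapsulated placement; the enclosing method structure is assumed from the context lines above, not taken from the PR's final code:

    def _init_pooler_config(self) -> Optional["PoolerConfig"]:
        if self.runner_type == "pooling":
            if isinstance(self.override_pooler_config, dict):
                self.override_pooler_config = PoolerConfig(
                    **self.override_pooler_config)

            # WIP: currently cuda graphs are not working for encoder models.
            # Scoped to pooling runners so other model types keep CUDA graphs.
            logger.warning("CUDA graph is not supported for pooling yet, "
                           "fallback to the eager mode.")
            self.enforce_eager = True

            pooler_config = self.override_pooler_config or PoolerConfig()
            ...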
3 changes: 2 additions & 1 deletion vllm/engine/arg_utils.py
@@ -1664,7 +1664,8 @@ def _set_default_args_v1(self, usage_context: UsageContext,

if (self.max_num_seqs is None
and usage_context in default_max_num_seqs):
self.max_num_seqs = default_max_num_seqs[usage_context]
self.max_num_seqs = min(default_max_num_seqs[usage_context],
self.max_num_batched_tokens)

logger.debug("Setting max_num_seqs to %d for %s usage context.",
self.max_num_seqs, use_context_value)
Expand Down
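
The clamp exists because each scheduled sequence consumes at least one token of the batch budget, so a max_num_seqs larger than max_num_batched_tokens could never actually be reached. A toy illustration with made-up numbers:

    # Illustration only; these numbers are hypothetical, not vLLM defaults.
    default_max_num_seqs = 1024      # assumed default for this usage context
    max_num_batched_tokens = 512     # user-configured token budget
    max_num_seqs = min(default_max_num_seqs, max_num_batched_tokens)
    assert max_num_seqs == 512       # clamped to the token budget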
11 changes: 10 additions & 1 deletion vllm/entrypoints/openai/serving_score.py
@@ -7,6 +7,7 @@
 
 from fastapi import Request
 
+import vllm.envs as envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
@@ -180,9 +181,17 @@
         input_ids = prompt_inputs["input_ids"]
         text_token_prompt = \
             self._validate_input(request, input_ids, request_prompt)
+
+        token_type_ids = prompt_inputs.get("token_type_ids")
+        mm_data = None
+        if envs.VLLM_USE_V1 and token_type_ids is not None:
+            mm_data = {"token_type_ids": token_type_ids}
+            token_type_ids = None
+
         engine_prompt = TokensPrompt(
             prompt_token_ids=text_token_prompt["prompt_token_ids"],
-            token_type_ids=prompt_inputs.get("token_type_ids"))
+            token_type_ids=token_type_ids,
+            multi_modal_data=mm_data)
 
Check failure on line 194 in vllm/entrypoints/openai/serving_score.py
GitHub Actions / pre-commit (the same annotation is reported by ten pre-commit jobs, differing only in how mypy spells Optional):
Incompatible types (expression has type "Optional[dict[str, Any]]", TypedDict item "multi_modal_data" has type "Mapping[str, Union[Any, list[Any]]]") [typeddict-item]

         request_prompts.append(request_prompt)
         engine_prompts.append(engine_prompt)
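
The pre-commit failure says TokensPrompt types multi_modal_data as a plain Mapping, so assigning an Optional value is rejected. One way to satisfy the checker, sketched here under the assumption that the optional keys are NotRequired on the TypedDict (not necessarily the PR's eventual fix), is to set each key only when it holds a real value instead of passing None:

    engine_prompt = TokensPrompt(
        prompt_token_ids=text_token_prompt["prompt_token_ids"])
    if mm_data is not None:
        # Per-key assignment on a TypedDict is checked individually, so
        # no Optional value ever reaches the non-Optional field type.
        engine_prompt["multi_modal_data"] = mm_data
    if token_type_ids is not None:
        engine_prompt["token_type_ids"] = token_type_ids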