
[V1][ModelRunner] Support pooling model for v1 engine #1359

Merged (4 commits) on Jun 30, 2025
10 changes: 7 additions & 3 deletions .github/workflows/vllm_ascend_test.yaml
@@ -86,7 +86,7 @@ jobs:
- name: Run codespell check
run: |
CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**')
CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn')
CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn')

codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}"
- name: Analysing the code with ruff
@@ -262,11 +262,13 @@ jobs:
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/test_embedding.py
pytest -sv tests/e2e/singlecard/ \
--ignore=tests/e2e/singlecard/test_offline_inference.py \
--ignore=tests/e2e/singlecard/test_ilama_lora.py \
--ignore=tests/e2e/singlecard/test_guided_decoding.py \
--ignore=tests/e2e/singlecard/test_camem.py
--ignore=tests/e2e/singlecard/test_camem.py \
--ignore=tests/e2e/singlecard/test_embedding.py

- name: Run e2e test on V0 engine
if: ${{ github.event_name == 'schedule' }}
@@ -281,14 +283,16 @@
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
pytest -sv tests/e2e/singlecard/test_embedding.py
pytest -sv tests/e2e/singlecard/ \
--ignore=tests/e2e/singlecard/test_offline_inference.py \
--ignore=tests/e2e/singlecard/test_ilama_lora.py \
--ignore=tests/e2e/singlecard/test_guided_decoding.py \
--ignore=tests/e2e/singlecard/test_camem.py \
--ignore=tests/e2e/singlecard/test_prompt_embedding.py \
--ignore=tests/e2e/singlecard/core/test_ascend_scheduler.py \
--ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py
--ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py \
--ignore=tests/e2e/singlecard/test_embedding.py

e2e-4-cards:
needs: [e2e]
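
For running the newly wired-up suite outside CI, a minimal sketch (assuming a working vllm-ascend environment with pytest installed) that mirrors the `pytest -sv tests/e2e/singlecard/test_embedding.py` command added to the workflow above:

# Local-run sketch; equivalent to the pytest invocation added to the workflow.
import sys

import pytest

if __name__ == "__main__":
    # -s: show stdout, -v: verbose test names, same flags as the workflow uses.
    sys.exit(pytest.main(["-sv", "tests/e2e/singlecard/test_embedding.py"]))
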
53 changes: 53 additions & 0 deletions examples/offline_embed.py
@@ -0,0 +1,53 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B
#

import os

import torch
from vllm import LLM

os.environ["VLLM_USE_MODELSCOPE"] = "True"


def get_detailed_instruct(task_description: str, query: str) -> str:
return f'Instruct: {task_description}\nQuery:{query}'


# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = [
get_detailed_instruct(task, 'What is the capital of China?'),
get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
"The capital of China is Beijing.",
"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
input_texts = queries + documents

model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")

outputs = model.embed(input_texts)
embeddings = torch.tensor([o.outputs.embedding for o in outputs])
# Calculate the similarity scores between the first two queries and the last two documents
scores = (embeddings[:2] @ embeddings[2:].T)
print(scores.tolist())
# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
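
The raw dot products above are only meaningful as similarity scores if the returned embeddings are already unit-norm. A hedged continuation of the same script that normalizes explicitly before scoring, so the result is a true cosine similarity either way:

# Continuation of the script above; normalize explicitly so the scores are
# cosine similarities even if the embeddings are not unit-norm.
import torch.nn.functional as F

normalized = F.normalize(embeddings, p=2, dim=1)
print((normalized[:2] @ normalized[2:].T).tolist())
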
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -12,3 +12,4 @@ xgrammar
zmq
types-psutil
pytest-cov
sentence_transformers
138 changes: 136 additions & 2 deletions tests/conftest.py
@@ -19,18 +19,23 @@

import contextlib
import gc
from typing import List, Optional, Tuple, TypeVar, Union
from typing import Any, List, Optional, Tuple, TypeVar, Union

import numpy as np
import pytest
import torch
from huggingface_hub import snapshot_download
from PIL import Image
from torch import nn
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
BatchEncoding, BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm import LLM, SamplingParams
from vllm.config import TaskOption
from vllm.config import TaskOption, _get_and_verify_dtype
from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils import is_list_of

from tests.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
@@ -45,6 +50,7 @@
from vllm.distributed.parallel_state import ( # noqa E402
destroy_distributed_environment, destroy_model_parallel)

_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
_M = TypeVar("_M")

_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
@@ -364,3 +370,131 @@
@pytest.fixture(scope="session")
def ilama_lora_files():
return snapshot_download(repo_id="jeeejeee/ilama-text2sql-spider")


class HfRunner:

    def get_default_device(self):
        from vllm.platforms import current_platform

        return ("cpu"
                if current_platform.is_cpu() else current_platform.device_type)

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        if x is None or isinstance(x, (bool, )):
            return x

        if device is None:
            device = self.device

        if isinstance(x, dict):
            return {k: self.wrap_device(v, device) for k, v in x.items()}

        if hasattr(x, "device") and x.device.type == device:
            return x

        return x.to(device)

    def __init__(
        self,
        model_name: str,
        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        trust_remote_code: bool = True,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
        model_name = maybe_model_redirect(model_name)
        self.model_name = model_name

        self.config = AutoConfig.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
        )
        self.device = self.get_default_device()
        self.dtype = torch_dtype = _get_and_verify_dtype(
            self.model_name,
            self.config,
            dtype=dtype,
            is_pooling_model=is_sentence_transformer or is_cross_encoder,
        )

        model_kwargs = model_kwargs if model_kwargs is not None else {}
        model_kwargs.setdefault("torch_dtype", torch_dtype)

        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer

            self.model = SentenceTransformer(
                model_name,
                device=self.device,
                model_kwargs=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder

            self.model = CrossEncoder(
                model_name,
                device=self.device,
                automodel_args=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        else:
            model = auto_cls.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code,
                **model_kwargs,
            )

            # in case some unquantized custom models are not in same dtype
            if (getattr(model, "quantization_method", None) is None
                    and any(p.dtype != self.dtype
                            for p in model.parameters())):
                model = model.to(dtype=self.dtype)

            if (getattr(model, "quantization_method", None) != "bitsandbytes"
                    and len({p.device
                             for p in model.parameters()}) < 2):
                model = model.to(device=self.device)

            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=trust_remote_code,
            )

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
        )
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer

    def encode(self, prompts: list[str], *args,
               **kwargs) -> list[list[torch.Tensor]]:
        return self.model.encode(prompts, *args, **kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        del self.model
        cleanup_dist_env_and_memory()


@pytest.fixture(scope="session")
def hf_runner():
    return HfRunner
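
The wrap_device helper above is carried over from upstream vLLM's conftest and is not exercised by the embedding test in this PR. A hedged, illustrative sketch of the kind of call it supports (the model name and the specific device strings here are examples, not something this PR adds):

# Illustrative only: move a tokenized batch onto the runner's default device
# ("npu" on Ascend, "cpu" otherwise). transformers.BatchEncoding exposes .to(),
# so the final x.to(device) branch of wrap_device handles it.
runner = HfRunner("Qwen/Qwen3-Embedding-0.6B", is_sentence_transformer=True)
batch = runner.tokenizer(["What is the capital of China?"],
                         return_tensors="pt")
batch = runner.wrap_device(batch)
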
72 changes: 72 additions & 0 deletions tests/e2e/singlecard/test_embedding.py
@@ -0,0 +1,72 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
from collections.abc import Sequence
from typing import Optional

import pytest
from modelscope import snapshot_download # type: ignore[import-untyped]

from tests.conftest import HfRunner
from tests.utils import check_embeddings_close, matryoshka_fy
from vllm_ascend.utils import vllm_version_is


def run_embedding_correctness_test(
hf_model: "HfRunner",
inputs: list[str],
vllm_outputs: Sequence[list[float]],
dimensions: Optional[int] = None,
):
hf_outputs = hf_model.encode(inputs)
if dimensions:
hf_outputs = matryoshka_fy(hf_outputs, dimensions)

check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)


# Dummy test so pytest does not collect nothing and exit with code 5.
def test_dummy():
assert True


@pytest.mark.skipif(vllm_version_is("0.9.1"),
reason="vLLM 0.9.1 does not support embed task for v1")
def test_embed_models_correctness(hf_runner, vllm_runner):
queries = ['What is the capital of China?', 'Explain gravity']

model_name = snapshot_download("Qwen/Qwen3-Embedding-0.6B")
with vllm_runner(
model_name,
task="embed",
enforce_eager=True,
) as vllm_model:
vllm_outputs = vllm_model.encode(queries)

with hf_runner(
model_name,
dtype="float32",
is_sentence_transformer=True,
) as hf_model:
run_embedding_correctness_test(hf_model, queries, vllm_outputs)
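
The check_embeddings_close and matryoshka_fy helpers imported from tests.utils are not part of this diff. A rough sketch of what they are assumed to do (cosine-similarity comparison and Matryoshka-style truncation); the actual implementations in the repository may differ:

# Assumed behaviour of the tests.utils helpers used above; not the actual
# implementations from this repository.
from collections.abc import Sequence

import torch
import torch.nn.functional as F


def check_embeddings_close(*, embeddings_0_lst: Sequence[Sequence[float]],
                           embeddings_1_lst: Sequence[Sequence[float]],
                           name_0: str, name_1: str,
                           tol: float = 1e-3) -> None:
    # Two sets of embeddings match if every pair has cosine similarity
    # within tol of 1.
    assert len(embeddings_0_lst) == len(embeddings_1_lst)
    for prompt_idx, (emb_0, emb_1) in enumerate(
            zip(embeddings_0_lst, embeddings_1_lst)):
        sim = F.cosine_similarity(torch.tensor(emb_0, dtype=torch.float32),
                                  torch.tensor(emb_1, dtype=torch.float32),
                                  dim=0)
        assert sim >= 1 - tol, (
            f"prompt {prompt_idx}: {name_0} and {name_1} disagree "
            f"(cosine similarity {sim:.4f})")


def matryoshka_fy(embeddings, dimensions: int) -> torch.Tensor:
    # Keep only the first `dimensions` components, then re-normalize.
    tensor = torch.as_tensor(embeddings, dtype=torch.float32)
    tensor = tensor[..., :dimensions]
    return F.normalize(tensor, p=2, dim=-1)
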