
Improve the output precision of embedding models #19092


Merged: 2 commits, Jun 4, 2025
6 changes: 1 addition & 5 deletions tests/models/language/pooling/embed_utils.py
@@ -55,14 +55,10 @@ def correctness_test_embed_models(hf_runner,
                     max_model_len=None,
                     **vllm_extra_kwargs) as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts)
-        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
-        model_dtype = getattr(
-            vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype",
-            vllm_dtype)
 
    with hf_runner(
            model_info.name,
-            dtype=model_dtype,
+            dtype="float32",
            is_sentence_transformer=True,
    ) as hf_model:
 
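Review note: the reference runner previously inherited the checkpoint's torch_dtype, so half-precision outputs were compared against a half-precision baseline. Pinning the sentence-transformers side to float32 removes that noise. A minimal sketch of the effect, outside vLLM (the model name is illustrative; assumes sentence-transformers >= 2.3 for model_kwargs and a GPU for the fp16 copy):

# Minimal sketch (not part of the PR) of why the reference is pinned to
# float32: the same sentence-transformers model loaded at half precision
# produces measurably different embeddings.
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

text = ["The quick brown fox jumps over the lazy dog."]

ref = SentenceTransformer("intfloat/e5-small")  # float32 by default
half = SentenceTransformer("intfloat/e5-small",
                           device="cuda",
                           model_kwargs={"torch_dtype": torch.float16})

e32 = ref.encode(text, normalize_embeddings=True)[0]
e16 = half.encode(text, normalize_embeddings=True)[0].astype(np.float32)

# The gap is on the order of fp16 epsilon -- exactly the noise a float32
# baseline avoids.
print("cosine:", float(np.dot(e32, e16)))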
12 changes: 6 additions & 6 deletions tests/models/language/pooling/mteb_utils.py
@@ -6,7 +6,6 @@
 import pytest
 
 from tests.models.utils import EmbedModelInfo
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 
 # Most models on the STS12 task (See #17175):
 # - Model implementation and minor changes in tensor dtype
@@ -103,17 +102,18 @@ def mteb_test_embed_models(hf_runner,
                                          MTEB_EMBED_TASKS)
        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
 
-    with set_default_torch_dtype(vllm_dtype) and hf_runner(
-            model_info.name, is_sentence_transformer=True,
-            dtype=vllm_dtype) as hf_model:
+    with hf_runner(model_info.name,
+                   is_sentence_transformer=True,
+                   dtype="float32") as hf_model:
 
        if hf_model_callback is not None:
            hf_model_callback(hf_model)
 
        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+        st_dtype = next(hf_model.model.parameters()).dtype
 
-    print("VLLM:", vllm_main_score)
-    print("SentenceTransformers:", st_main_score)
+    print("VLLM:", vllm_dtype, vllm_main_score)
+    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)
 
    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
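Worth noting: the removed `with set_default_torch_dtype(vllm_dtype) and hf_runner(...)` line was broken as well as unnecessary, because `and` collapses the two context managers into a single object, so `set_default_torch_dtype` was never actually entered. A standalone illustration:

# Standalone illustration: `with A() and B():` only enters B, because
# `A() and B()` evaluates to B whenever A is truthy.
from contextlib import contextmanager

@contextmanager
def cm(name):
    print("enter", name)
    yield name
    print("exit", name)

with cm("A") and cm("B") as which:
    print("inside", which)
# prints: enter B / inside B / exit B -- "A" is created but never entered.
# The correct spelling would be: with cm("A"), cm("B") as which: ...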
7 changes: 0 additions & 7 deletions tests/models/language/pooling/test_gte.py
@@ -10,27 +10,21 @@
    ########## BertModel
    EmbedModelInfo("thenlper/gte-large",
                   architecture="BertModel",
-                   dtype="float32",
                   enable_test=True),
    EmbedModelInfo("thenlper/gte-base",
                   architecture="BertModel",
-                   dtype="float32",
                   enable_test=False),
    EmbedModelInfo("thenlper/gte-small",
                   architecture="BertModel",
-                   dtype="float32",
                   enable_test=False),
    EmbedModelInfo("thenlper/gte-large-zh",
                   architecture="BertModel",
-                   dtype="float32",
                   enable_test=False),
    EmbedModelInfo("thenlper/gte-base-zh",
                   architecture="BertModel",
-                   dtype="float32",
                   enable_test=False),
    EmbedModelInfo("thenlper/gte-small-zh",
                   architecture="BertModel",
-                   dtype="float32",
                   enable_test=False),
    ########### NewModel
    EmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
@@ -45,7 +39,6 @@
    ########### Qwen2ForCausalLM
    EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                   architecture="Qwen2ForCausalLM",
-                   dtype="float32",
                   enable_test=True),
    ########## ModernBertModel
    EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
46 changes: 46 additions & 0 deletions tests/models/language/pooling/test_intfloat.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+from ...utils import EmbedModelInfo
+from .embed_utils import correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
+
+MODELS = [
+    ########## BertModel
+    EmbedModelInfo("intfloat/e5-small",
+                   architecture="BertModel",
+                   enable_test=True),
+    EmbedModelInfo("intfloat/e5-base",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("intfloat/e5-large",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("intfloat/multilingual-e5-small",
+                   architecture="BertModel",
+                   enable_test=False),
+    ########## XLMRobertaModel
+    EmbedModelInfo("intfloat/multilingual-e5-base",
+                   architecture="XLMRobertaModel",
+                   enable_test=True),
+    EmbedModelInfo("intfloat/multilingual-e5-large",
+                   architecture="XLMRobertaModel",
+                   enable_test=False),
+    EmbedModelInfo("intfloat/multilingual-e5-large-instruct",
+                   architecture="XLMRobertaModel",
+                   enable_test=False),
+]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts)
3 changes: 1 addition & 2 deletions tests/models/language/pooling/test_jina.py
@@ -31,8 +31,7 @@
 EMBEDDING_MODELS = [
    EmbedModelInfo("jinaai/jina-embeddings-v3",
                   architecture="XLMRobertaModel",
-                   is_matryoshka=True,
-                   dtype="float32")
+                   is_matryoshka=True)
 ]
 
3 changes: 0 additions & 3 deletions tests/models/language/pooling/test_nomic.py
@@ -8,18 +8,15 @@
 MODELS = [
    EmbedModelInfo("nomic-ai/nomic-embed-text-v1",
                   architecture="NomicBertModel",
-                   dtype="float32",
                   enable_test=True),
    EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
                   architecture="NomicBertModel",
-                   dtype="float32",
                   enable_test=False),
    EmbedModelInfo("nomic-ai/CodeRankEmbed",
                   architecture="NomicBertModel",
                   enable_test=False),
    EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
                   architecture="NomicBertModel",
-                   dtype="float32",
                   enable_test=True)
 ]
 
13 changes: 9 additions & 4 deletions vllm/model_executor/models/bert.py
@@ -413,10 +413,15 @@ def forward(
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
-        return self.model(input_ids=input_ids,
-                          position_ids=positions,
-                          inputs_embeds=inputs_embeds,
-                          intermediate_tensors=intermediate_tensors)
+        hidden_states = self.model(input_ids=input_ids,
+                                   position_ids=positions,
+                                   inputs_embeds=inputs_embeds,
+                                   intermediate_tensors=intermediate_tensors)
+
+        # convert the embedding output to float32,
+        # otherwise precision will be lost significantly
+        hidden_states = hidden_states.to(torch.float32)
+        return hidden_states
 
    def pooler(
        self,
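The cast matters because these hidden states feed the pooler and, in the tests above, cosine-similarity comparisons against a float32 baseline; keeping them at half precision perturbs such scores on the order of fp16 epsilon. A rough sketch, independent of vLLM:

# Rough sketch: merely round-tripping vectors through fp16 storage shifts
# a downstream cosine-similarity score by roughly fp16 epsilon, which is
# comparable to the tight tolerances the embedding tests assert.
import torch

torch.manual_seed(0)
a = torch.randn(768)
b = a + 0.1 * torch.randn(768)  # a similar, but not identical, vector
cos = torch.nn.CosineSimilarity(dim=0)

exact = cos(a, b)
rounded = cos(a.half().float(), b.half().float())
print(exact.item(), rounded.item(), (exact - rounded).abs().item())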
7 changes: 6 additions & 1 deletion vllm/model_executor/models/bert_with_rope.py
@@ -431,7 +431,12 @@ def forward(
        else:
            hidden_states = self.embeddings(input_ids=input_ids,
                                            token_type_ids=token_type_ids)
-        return self.encoder(positions, hidden_states)
+        hidden_states = self.encoder(positions, hidden_states)
+
+        # convert the embedding output to float32,
+        # otherwise precision will be lost significantly
+        hidden_states = hidden_states.to(torch.float32)
+        return hidden_states
 
    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
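The same boundary cast in bert_with_rope.py keeps the RoPE-based BERT variants consistent with plain BertModel: the encoder still runs in the serving dtype and only the returned hidden states are upcast, so the extra cost is a single float32 copy. From the user side, the effect should look like the hedged sketch below (model name illustrative; LLM.embed and task="embed" as in recent vLLM releases):

# Serve the model itself at half precision; after this change the pooler
# sees float32 hidden states, so the embedding keeps full output precision.
from vllm import LLM

llm = LLM(model="intfloat/e5-small", task="embed", dtype="float16")
(out,) = llm.embed(["hello world"])
print(len(out.outputs.embedding))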