
[CI] Add mteb testing to test the accuracy of the embedding model #17175


Merged · 1 commit · May 20, 2025
1 change: 1 addition & 0 deletions requirements/test.in
@@ -33,6 +33,7 @@ num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.51.3
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
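For context, mteb scores an embedding model on a set of benchmark tasks and reports a main score per task; the rest of this PR builds on that. Below is a minimal sketch of what the new dependency does, assuming the mteb 1.38.x API and using the STS12 task purely for illustration (the PR's real task list, MTEB_EMBED_TASKS, lives in mteb_utils.py and is not shown in this diff):

# Sketch only: score a SentenceTransformer baseline with mteb 1.38.x.
import mteb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-m3")  # the model served by the new CI test
tasks = mteb.get_tasks(tasks=["STS12"])     # illustrative task choice
results = mteb.MTEB(tasks=tasks).run(model, verbosity=0, output_folder=None)
# Result layout can vary between mteb versions; 1.38.x exposes per-split
# score dicts that include "main_score".
print(results[0].scores["test"][0]["main_score"])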
22 changes: 21 additions & 1 deletion requirements/test.txt
@@ -99,6 +99,7 @@ datasets==3.0.2
# via
# evaluate
# lm-eval
# mteb
decorator==5.1.1
# via librosa
dill==0.3.8
@@ -124,6 +125,8 @@ email-validator==2.2.0
# via pydantic
encodec==0.1.1
# via vocos
eval-type-backport==0.2.2
# via mteb
evaluate==0.4.3
# via lm-eval
fastparquet==2024.11.0
@@ -291,6 +294,8 @@ msgpack==1.1.0
# via
# librosa
# ray
mteb==1.38.11
# via -r requirements/test.in
multidict==6.1.0
# via
# aiohttp
@@ -331,6 +336,7 @@ numpy==1.26.4
# librosa
# matplotlib
# mistral-common
# mteb
# numba
# numexpr
# opencv-python-headless
@@ -443,6 +449,8 @@ plotly==5.24.1
# via genai-perf
pluggy==1.5.0
# via pytest
polars==1.29.0
# via mteb
pooch==1.8.2
# via librosa
portalocker==2.10.1
@@ -476,6 +484,7 @@ pydantic==2.9.2
# via
# datamodel-code-generator
# mistral-common
# mteb
pydantic-core==2.23.4
# via pydantic
pygments==2.18.0
@@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0
# typepy
python-rapidjson==1.20
# via tritonclient
pytrec-eval-terrier==0.5.7
# via mteb
pytz==2024.2
# via
# pandas
@@ -564,6 +575,7 @@ requests==2.32.3
# huggingface-hub
# lm-eval
# mistral-common
# mteb
# pooch
# ray
# responses
@@ -580,6 +592,7 @@ rfc3987==1.3.8
rich==13.9.4
# via
# genai-perf
# mteb
# typer
rouge-score==0.1.2
# via lm-eval
@@ -607,16 +620,20 @@ scikit-learn==1.5.2
# via
# librosa
# lm-eval
# mteb
# sentence-transformers
scipy==1.13.1
# via
# librosa
# mteb
# scikit-learn
# sentence-transformers
# statsmodels
# vocos
sentence-transformers==3.2.1
# via -r requirements/test.in
# via
# -r requirements/test.in
# mteb
sentencepiece==0.2.0
# via mistral-common
setuptools==77.0.3
@@ -696,6 +713,7 @@ torch==2.7.0+cu128
# fastsafetensors
# lm-eval
# mamba-ssm
# mteb
# peft
# runai-model-streamer
# sentence-transformers
@@ -720,6 +738,7 @@ tqdm==4.66.6
# evaluate
# huggingface-hub
# lm-eval
# mteb
# nltk
# peft
# pqdm
@@ -759,6 +778,7 @@ typing-extensions==4.12.2
# huggingface-hub
# librosa
# mistral-common
# mteb
# pqdm
# pydantic
# pydantic-core
42 changes: 42 additions & 0 deletions tests/entrypoints/openai/correctness/test_mteb.py
@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
import math
import os

import pytest

from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
                                                       OpenAIClientMtebEncoder,
                                                       run_mteb_embed_task,
                                                       run_mteb_embed_task_st)
from tests.utils import RemoteOpenAIServer

os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "BAAI/bge-m3"
DTYPE = "float16"
MAIN_SCORE = 0.7873427091972599


@pytest.fixture(scope="module")
def server():
    args = [
        "--task", "embed", "--dtype", DTYPE, "--enforce-eager",
        "--max-model-len", "512"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


def test_mteb(server):
    client = server.get_client()
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
    st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
        MODEL_NAME, MTEB_EMBED_TASKS)

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

    assert math.isclose(st_main_score, vllm_main_score, rel_tol=1e-4)
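The helpers imported above (OpenAIClientMtebEncoder, run_mteb_embed_task, run_mteb_embed_task_st) live in tests/models/language/pooling/mteb_utils.py, which this diff does not touch. As a hypothetical sketch of their shape: mteb only needs an object with an encode(sentences, **kwargs) method returning an array of embeddings, so the server-side encoder can simply forward to the OpenAI-compatible /v1/embeddings endpoint.

# Hypothetical sketch, not the PR's actual mteb_utils implementation.
import mteb
import numpy as np
from openai import OpenAI


class OpenAIClientMtebEncoderSketch:
    """Duck-typed mteb encoder backed by an OpenAI-compatible embeddings API."""

    def __init__(self, model_name: str, client: OpenAI):
        self.model_name = model_name
        self.client = client

    def encode(self, sentences, **kwargs) -> np.ndarray:
        # One request per call keeps the sketch simple; mteb batches internally.
        resp = self.client.embeddings.create(model=self.model_name,
                                             input=list(sentences))
        return np.array([d.embedding for d in resp.data])


def run_mteb_embed_task_sketch(encoder, task_names) -> float:
    # Hypothetical equivalent of run_mteb_embed_task: run the tasks and return
    # the main score of the first task (result layout varies by mteb version).
    tasks = mteb.get_tasks(tasks=list(task_names))
    results = mteb.MTEB(tasks=tasks).run(encoder, verbosity=0,
                                         output_folder=None)
    return results[0].scores["test"][0]["main_score"]

Note that MAIN_SCORE is a precomputed SentenceTransformer reference score for BAAI/bge-m3, so the MAIN_SCORE or run_mteb_embed_task_st(...) expression short-circuits and the slower SentenceTransformer run only happens if that constant is removed or set to a falsy value.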
2 changes: 0 additions & 2 deletions tests/models/language/pooling/test_gte.py
@@ -58,8 +58,6 @@
@pytest.mark.parametrize("model_info", MODELS)
def test_models_mteb(hf_runner, vllm_runner,
                     model_info: EmbedModelInfo) -> None:
    pytest.skip("Skipping mteb test.")

    from .mteb_utils import mteb_test_embed_models

    vllm_extra_kwargs: dict[str, Any] = {}
1 change: 0 additions & 1 deletion tests/models/language/pooling/test_nomic.py
@@ -23,7 +23,6 @@
@pytest.mark.parametrize("model_info", MODELS)
def test_models_mteb(hf_runner, vllm_runner,
                     model_info: EmbedModelInfo) -> None:
    pytest.skip("Skipping mteb test.")
    from .mteb_utils import mteb_test_embed_models
    mteb_test_embed_models(hf_runner, vllm_runner, model_info)

@@ -46,7 +46,6 @@ def test_models_mteb(
    vllm_runner,
    model_info: EmbedModelInfo,
) -> None:
    pytest.skip("Skipping mteb test.")
    from .mteb_utils import mteb_test_embed_models
    mteb_test_embed_models(hf_runner, vllm_runner, model_info)

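The three skip removals above re-enable the offline mteb comparisons that go through hf_runner/vllm_runner instead of an HTTP server. Their shared helper, mteb_test_embed_models, also lives in mteb_utils.py and is not shown here; conceptually, its vLLM side can be built on the same encode() interface, this time wrapping the in-process entrypoint. A hedged sketch, assuming vLLM's offline LLM.embed() API and mirroring the flags the server test passes:

# Hypothetical sketch of the offline (non-server) side of the comparison.
import numpy as np
from vllm import LLM


class VllmMtebEncoderSketch:
    """Duck-typed mteb encoder backed by an in-process vLLM engine."""

    def __init__(self, model_name: str):
        self.llm = LLM(model=model_name, task="embed", dtype="float16",
                       enforce_eager=True, max_model_len=512)

    def encode(self, sentences, **kwargs) -> np.ndarray:
        outputs = self.llm.embed(list(sentences))
        return np.array([o.outputs.embedding for o in outputs])

The resulting main score would then be compared against the SentenceTransformer baseline with the same rel_tol=1e-4 closeness check used in test_mteb.py.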