diff --git a/requirements/test.in b/requirements/test.in index cdc7c563f087..87af61769038 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -33,6 +33,7 @@ num2words # required for smolvlm test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test +mteb>=1.38.11, <2 # required for mteb test transformers==4.51.3 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. diff --git a/requirements/test.txt b/requirements/test.txt index 9a15d9a0d824..89d477017342 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -99,6 +99,7 @@ datasets==3.0.2 # via # evaluate # lm-eval + # mteb decorator==5.1.1 # via librosa dill==0.3.8 @@ -124,6 +125,8 @@ email-validator==2.2.0 # via pydantic encodec==0.1.1 # via vocos +eval-type-backport==0.2.2 + # via mteb evaluate==0.4.3 # via lm-eval fastparquet==2024.11.0 @@ -291,6 +294,8 @@ msgpack==1.1.0 # via # librosa # ray +mteb==1.38.11 + # via -r requirements/test.in multidict==6.1.0 # via # aiohttp @@ -331,6 +336,7 @@ numpy==1.26.4 # librosa # matplotlib # mistral-common + # mteb # numba # numexpr # opencv-python-headless @@ -443,6 +449,8 @@ plotly==5.24.1 # via genai-perf pluggy==1.5.0 # via pytest +polars==1.29.0 + # via mteb pooch==1.8.2 # via librosa portalocker==2.10.1 @@ -476,6 +484,7 @@ pydantic==2.9.2 # via # datamodel-code-generator # mistral-common + # mteb pydantic-core==2.23.4 # via pydantic pygments==2.18.0 @@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0 # typepy python-rapidjson==1.20 # via tritonclient +pytrec-eval-terrier==0.5.7 + # via mteb pytz==2024.2 # via # pandas @@ -564,6 +575,7 @@ requests==2.32.3 # huggingface-hub # lm-eval # mistral-common + # mteb # pooch # ray # responses @@ -580,6 +592,7 @@ rfc3987==1.3.8 rich==13.9.4 # via # genai-perf + # mteb # typer rouge-score==0.1.2 # via lm-eval @@ -607,16 +620,20 @@ scikit-learn==1.5.2 # via # librosa # lm-eval + # mteb # sentence-transformers scipy==1.13.1 # via # librosa + # mteb # scikit-learn # sentence-transformers # statsmodels # vocos sentence-transformers==3.2.1 - # via -r requirements/test.in + # via + # -r requirements/test.in + # mteb sentencepiece==0.2.0 # via mistral-common setuptools==77.0.3 @@ -696,6 +713,7 @@ torch==2.7.0+cu128 # fastsafetensors # lm-eval # mamba-ssm + # mteb # peft # runai-model-streamer # sentence-transformers @@ -720,6 +738,7 @@ tqdm==4.66.6 # evaluate # huggingface-hub # lm-eval + # mteb # nltk # peft # pqdm @@ -759,6 +778,7 @@ typing-extensions==4.12.2 # huggingface-hub # librosa # mistral-common + # mteb # pqdm # pydantic # pydantic-core diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py new file mode 100644 index 000000000000..b702e0acd38b --- /dev/null +++ b/tests/entrypoints/openai/correctness/test_mteb.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +import math +import os + +import pytest + +from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS, + OpenAIClientMtebEncoder, + run_mteb_embed_task, + run_mteb_embed_task_st) +from tests.utils import RemoteOpenAIServer + +os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" + +MODEL_NAME = "BAAI/bge-m3" +DTYPE = "float16" +MAIN_SCORE = 0.7873427091972599 + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", "embed", "--dtype", DTYPE, "--enforce-eager", + "--max-model-len", "512" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +def test_mteb(server): + client = server.get_client() + encoder = OpenAIClientMtebEncoder(MODEL_NAME, client) + vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS) + st_main_score = MAIN_SCORE or run_mteb_embed_task_st( + MODEL_NAME, MTEB_EMBED_TASKS) + + print("VLLM main score: ", vllm_main_score) + print("SentenceTransformer main score: ", st_main_score) + print("Difference: ", st_main_score - vllm_main_score) + + assert math.isclose(st_main_score, vllm_main_score, rel_tol=1e-4) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 3ccf2999664c..b60d27aaa72b 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -58,8 +58,6 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) def test_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: - pytest.skip("Skipping mteb test.") - from .mteb_utils import mteb_test_embed_models vllm_extra_kwargs: dict[str, Any] = {} diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index 6e9de30f977d..28df32e0c230 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -23,7 +23,6 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) def test_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: - pytest.skip("Skipping mteb test.") from .mteb_utils import mteb_test_embed_models mteb_test_embed_models(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index 7d9c3c73d852..5679e0e1ce00 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -46,7 +46,6 @@ def test_models_mteb( vllm_runner, model_info: EmbedModelInfo, ) -> None: - pytest.skip("Skipping mteb test.") from .mteb_utils import mteb_test_embed_models mteb_test_embed_models(hf_runner, vllm_runner, model_info)