diff --git a/tests/entrypoints/openai/correctness/test_mteb_embed.py b/tests/entrypoints/openai/correctness/test_mteb_embed.py
index 783f7d3e0d5aa..1601c18d9b787 100644
--- a/tests/entrypoints/openai/correctness/test_mteb_embed.py
+++ b/tests/entrypoints/openai/correctness/test_mteb_embed.py
@@ -37,4 +37,6 @@ def test_mteb_embed(server):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_EMBED_TOL
diff --git a/tests/entrypoints/openai/correctness/test_mteb_score.py b/tests/entrypoints/openai/correctness/test_mteb_score.py
index cfb865815c9b2..417f85adc6e06 100644
--- a/tests/entrypoints/openai/correctness/test_mteb_score.py
+++ b/tests/entrypoints/openai/correctness/test_mteb_score.py
@@ -6,16 +6,19 @@ import pytest
 
 # yapf conflicts with isort for this block
 # yapf: disable
-from tests.models.language.pooling.mteb_utils import (
-    MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
-    RerankClientMtebEncoder, ScoreClientMtebEncoder,
-    mteb_test_rerank_models_hf, run_mteb_rerank)
+from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
+                                                      MTEB_RERANK_TASKS,
+                                                      MTEB_RERANK_TOL,
+                                                      RerankClientMtebEncoder,
+                                                      ScoreClientMtebEncoder,
+                                                      run_mteb_rerank)
 # yapf: enable
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 
 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+st_main_score = 0.33457
 
 
 @pytest.fixture(scope="module")
@@ -29,15 +32,7 @@ def server():
         yield remote_server
 
 
-@pytest.fixture(scope="module")
-def st_main_score(hf_runner):
-    # The main score related to the version of the dependency.
-    # So we need to recalculate every time.
-    main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
-    return main_score
-
-
-def test_mteb_score(server, st_main_score):
+def test_mteb_score(server):
     url = server.url_for("score")
     encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@@ -47,10 +42,12 @@ def test_mteb_score(server, st_main_score):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
 
 
-def test_mteb_rerank(server, st_main_score):
+def test_mteb_rerank(server):
     url = server.url_for("rerank")
     encoder = RerankClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@@ -60,4 +57,6 @@ def test_mteb_rerank(server, st_main_score):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py
index a74ad2aa25972..8f8393c4e16fc 100644
--- a/tests/models/language/pooling/embed_utils.py
+++ b/tests/models/language/pooling/embed_utils.py
@@ -35,10 +35,7 @@ def correctness_test_embed_models(hf_runner,
                                   example_prompts,
                                   vllm_extra_kwargs=None,
                                   hf_model_callback=None):
-    if not model_info.enable_test:
-        # A model family has many models with the same architecture,
-        # and we don't need to test each one.
-        pytest.skip("Skipping test.")
+    pytest.skip("Debug only, ci prefers to use mteb test.")
 
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"
diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py
index 640858125bfca..7be1bba2ff69f 100644
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -18,7 +18,7 @@ from tests.models.utils import EmbedModelInfo, RerankModelInfo
 # - Different model results in differences more than 1e-3
 # 1e-4 is a good tolerance threshold
 MTEB_EMBED_TASKS = ["STS12"]
-MTEB_EMBED_TOL = 0.02
+MTEB_EMBED_TOL = 1e-4
 
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
@@ -192,22 +192,28 @@ def mteb_test_embed_models(hf_runner,
                                               MTEB_EMBED_TASKS)
         vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
 
-    with hf_runner(model_info.name,
-                   is_sentence_transformer=True,
-                   dtype="float32") as hf_model:
+    if model_info.mteb_score is None:
+        with hf_runner(model_info.name,
+                       is_sentence_transformer=True,
+                       dtype="float32") as hf_model:
 
-        if hf_model_callback is not None:
-            hf_model_callback(hf_model)
+            if hf_model_callback is not None:
+                hf_model_callback(hf_model)
 
-        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
-        st_dtype = next(hf_model.model.parameters()).dtype
+            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+            st_dtype = next(hf_model.model.parameters()).dtype
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
 
     print("Model:", model_info.name)
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol
 
 
 def run_mteb_rerank(cross_encoder, tasks, languages):
@@ -310,12 +316,18 @@ def mteb_test_rerank_models(hf_runner,
                                           languages=MTEB_RERANK_LANGS)
         vllm_dtype = model_config.dtype
 
-    st_main_score, st_dtype = mteb_test_rerank_models_hf(
-        hf_runner, model_info.name, hf_model_callback)
+    if model_info.mteb_score is None:
+        st_main_score, st_dtype = mteb_test_rerank_models_hf(
+            hf_runner, model_info.name, hf_model_callback)
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
 
     print("Model:", model_info.name)
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol
diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py
index 6fbe0e82d7f8a..be8cb6fa76994 100644
--- a/tests/models/language/pooling/test_baai.py
+++ b/tests/models/language/pooling/test_baai.py
@@ -12,6 +12,7 @@ MODELS = [
     ########## BertModel
     CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
                              architecture="BertModel",
+                             mteb_score=0.779336792,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
                              architecture="BertModel",
@@ -52,10 +53,12 @@ MODELS = [
     ########## XLMRobertaModel
     CLSPoolingEmbedModelInfo("BAAI/bge-m3",
                              architecture="XLMRobertaModel",
+                             mteb_score=0.787343078,
                              enable_test=True),
     ########## Qwen2Model
     LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
                               architecture="Qwen2Model",
+                              mteb_score=0.75724465,
                               dtype="float32",
                               enable_test=True),
 ]
@@ -65,6 +68,7 @@ RERANK_MODELS = [
     CLSPoolingRerankModelInfo(
         "BAAI/bge-reranker-base",
         architecture="XLMRobertaForSequenceClassification",
+        mteb_score=0.32398,
         enable_test=True),
     CLSPoolingRerankModelInfo(
         "BAAI/bge-reranker-large",
diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py
index f473e0ba01ffa..eaa8bfb84ffdd 100644
--- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py
@@ -104,7 +104,6 @@ class GemmaMtebEncoder(VllmMtebEncoder):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.prompt = PROMPT
         self.query_template = "A: {query}\n"
         self.document_template = "B: {doc}\n{prompt}"
 
@@ -119,7 +118,7 @@ class GemmaMtebEncoder(VllmMtebEncoder):
         _sentences = []
         for query, corpus, prompt in sentences:
             query = self.query_template.format(query=query)
-            corpus = self.document_template.format(doc=corpus, prompt=prompt)
+            corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
             _sentences.append((query, corpus, prompt))
 
         return super().predict(_sentences, *args, **kwargs)
diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling/test_cross_encoder.py
index 8c1bc5779b8a1..b49908c9ce6a6 100644
--- a/tests/models/language/pooling/test_cross_encoder.py
+++ b/tests/models/language/pooling/test_cross_encoder.py
@@ -8,8 +8,10 @@ from .mteb_utils import mteb_test_rerank_models
 
 RERANK_MODELS = [
     CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
+                              mteb_score=0.32898,
                               architecture="BertForSequenceClassification"),
     LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
+                               mteb_score=0.25736,
                                architecture="Qwen3ForSequenceClassification")
 ]
 
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index f918b2b91bcc3..0733ac85c11fc 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -7,7 +7,7 @@ import pytest
 from vllm.config import PoolerConfig
 from vllm.platforms import current_platform
 
-from ...utils import check_embeddings_close, check_transformers_version
+from ...utils import check_embeddings_close
 
 
 @pytest.mark.parametrize(
@@ -30,7 +30,6 @@ from ...utils import check_embeddings_close, check_transformers_version
         pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         # [Cross-Encoder]
         pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
@@ -42,8 +41,6 @@ def test_models(
     model,
     monkeypatch,
 ) -> None:
-    if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        check_transformers_version(model, max_transformers_version="4.53.2")
 
     if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py
index 9911620c018ef..98d215b0ad25e 100644
--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
@@ -5,13 +5,14 @@ import pytest
 
 from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
                       EmbedModelInfo, LASTPoolingEmbedModelInfo,
-                      RerankModelInfo, check_transformers_version)
+                      RerankModelInfo)
 from .embed_utils import correctness_test_embed_models
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
     CLSPoolingEmbedModelInfo("thenlper/gte-large",
+                             mteb_score=0.76807651,
                              architecture="BertModel",
                              enable_test=True),
     CLSPoolingEmbedModelInfo("thenlper/gte-base",
@@ -30,28 +31,37 @@ MODELS = [
                              architecture="BertModel",
                              enable_test=False),
     ########### NewModel
+    # These three architectures are almost the same, but not exactly the same.
+    # For example,
+    # - whether to use token_type_embeddings
+    # - whether to use context expansion
+    # So only test one (the most widely used) model
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
                              architecture="GteNewModel",
+                             mteb_score=0.775074696,
                              hf_overrides={"architectures": ["GteNewModel"]},
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
                              architecture="GteNewModel",
                              hf_overrides={"architectures": ["GteNewModel"]},
-                             enable_test=True),
+                             enable_test=False),
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
                              architecture="GteNewModel",
                              hf_overrides={"architectures": ["GteNewModel"]},
-                             enable_test=True),
+                             enable_test=False),
     ########### Qwen2ForCausalLM
     LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+                              mteb_score=0.758473459018872,
                               architecture="Qwen2ForCausalLM",
                               enable_test=True),
     ########## ModernBertModel
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
+                             mteb_score=0.748193353,
                              architecture="ModernBertModel",
                              enable_test=True),
     ########## Qwen3ForCausalLM
     LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
+                              mteb_score=0.771163695,
                               architecture="Qwen3ForCausalLM",
                               dtype="float32",
                               enable_test=True),
@@ -65,10 +75,12 @@ RERANK_MODELS = [
     CLSPoolingRerankModelInfo(
         # classifier_pooling: mean
         "Alibaba-NLP/gte-reranker-modernbert-base",
+        mteb_score=0.33386,
         architecture="ModernBertForSequenceClassification",
         enable_test=True),
     CLSPoolingRerankModelInfo(
         "Alibaba-NLP/gte-multilingual-reranker-base",
+        mteb_score=0.33062,
         architecture="GteNewForSequenceClassification",
         hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
         enable_test=True),
@@ -78,10 +90,6 @@ RERANK_MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        check_transformers_version(model_info.name,
-                                   max_transformers_version="4.53.2")
-
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
@@ -89,10 +97,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
 def test_embed_models_correctness(hf_runner, vllm_runner,
                                   model_info: EmbedModelInfo,
                                   example_prompts) -> None:
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        check_transformers_version(model_info.name,
-                                   max_transformers_version="4.53.2")
-
     correctness_test_embed_models(hf_runner, vllm_runner, model_info,
                                   example_prompts)
 
diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py
index 6cae53a660ad8..bc95475836e87 100644
--- a/tests/models/language/pooling/test_intfloat.py
+++ b/tests/models/language/pooling/test_intfloat.py
@@ -10,6 +10,7 @@ MODELS = [
     ########## BertModel
     CLSPoolingEmbedModelInfo("intfloat/e5-small",
                              architecture="BertModel",
+                             mteb_score=0.742285423,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("intfloat/e5-base",
                              architecture="BertModel",
@@ -23,6 +24,7 @@ MODELS = [
     ########## XLMRobertaModel
     CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
                              architecture="XLMRobertaModel",
+                             mteb_score=0.779325955,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
                              architecture="XLMRobertaModel",
@@ -36,7 +38,7 @@ MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
-    mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py
index 37c5bdc97dd98..c4e4835556a54 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -14,6 +14,7 @@ from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 EMBEDDING_MODELS = [
     CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3",
+                             mteb_score=0.824413164,
                              architecture="XLMRobertaModel",
                              is_matryoshka=True)
 ]
@@ -21,6 +22,7 @@ EMBEDDING_MODELS = [
 RERANK_MODELS = [
     CLSPoolingRerankModelInfo(
         "jinaai/jina-reranker-v2-base-multilingual",
+        mteb_score=0.33643,
         architecture="XLMRobertaForSequenceClassification")
 ]
 
diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling/test_mxbai_rerank.py
index 73823deeff4e0..1731c6ae6fff7 100644
--- a/tests/models/language/pooling/test_mxbai_rerank.py
+++ b/tests/models/language/pooling/test_mxbai_rerank.py
@@ -20,6 +20,7 @@ RERANK_MODELS = [
     LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
                                architecture="Qwen2ForSequenceClassification",
                                hf_overrides=mxbai_rerank_hf_overrides,
+                               mteb_score=0.273,
                                enable_test=True),
     LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
                                architecture="Qwen2ForSequenceClassification",
diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py
index 2d05958e9bcda..52a8ce6e6671f 100644
--- a/tests/models/language/pooling/test_nomic.py
+++ b/tests/models/language/pooling/test_nomic.py
@@ -10,6 +10,7 @@ from .mteb_utils import mteb_test_embed_models
 MODELS = [
     CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1",
                              architecture="NomicBertModel",
+                             mteb_score=0.737568559,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
                              architecture="NomicBertModel",
@@ -19,6 +20,7 @@ MODELS = [
                              enable_test=False),
     CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
                              architecture="NomicBertModel",
+                             mteb_score=0.715488912,
                              enable_test=True)
 ]
 
diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py
index 5dd2d9eae9115..ebdacf9d0c673 100644
--- a/tests/models/language/pooling/test_qwen3_reranker.py
+++ b/tests/models/language/pooling/test_qwen3_reranker.py
@@ -20,6 +20,7 @@ qwen3_reranker_hf_overrides = {
 RERANK_MODELS = [
     LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
                                architecture="Qwen3ForSequenceClassification",
+                               mteb_score=0.25736,
                                hf_overrides=qwen3_reranker_hf_overrides,
                                enable_test=True),
     LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py
index c22c78592e535..864f3d75ef5aa 100644
--- a/tests/models/language/pooling/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py
@@ -11,6 +11,7 @@ MODELS = [
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
                              is_matryoshka=False,
                              architecture="BertModel",
+                             mteb_score=0.714927797,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
                              is_matryoshka=False,
@@ -23,6 +24,7 @@ MODELS = [
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
                              is_matryoshka=False,
                              architecture="NomicBertModel",
+                             mteb_score=0.681146831,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
                              is_matryoshka=False,
@@ -31,14 +33,17 @@ MODELS = [
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
                              is_matryoshka=True,
                              architecture="BertModel",
+                             mteb_score=0.649088363,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
                              is_matryoshka=True,
                              architecture="XLMRobertaModel",
+                             mteb_score=0.712258299,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
                              is_matryoshka=True,
                              architecture="GteModel",
+                             mteb_score=0.706622444,
                              enable_test=True),
 ]
 
@@ -46,7 +51,7 @@ MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
-    mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
diff --git a/tests/models/language/pooling/test_st_projector.py b/tests/models/language/pooling/test_st_projector.py
index 51ddbcc5ab249..bafeb4060d80a 100644
--- a/tests/models/language/pooling/test_st_projector.py
+++ b/tests/models/language/pooling/test_st_projector.py
@@ -10,6 +10,7 @@ ST_PROJECTOR_MODELS = [
     CLSPoolingEmbedModelInfo(
         "TencentBAC/Conan-embedding-v1",
         architecture="BertModel",
+        mteb_score=0.688611955,
         enable_test=True,
     ),
 ]
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 0fb1f5b3753b5..40a41afff8287 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -347,6 +347,7 @@ class ModelInfo:
     dtype: str = "auto"
     hf_overrides: Optional[dict[str, Any]] = None
     default_pooling_type: str = ""
+    mteb_score: Optional[float] = None
     enable_test: bool = True
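
Note: the following is a minimal illustrative sketch, not part of the patch above. The scores and
the tolerance value are made up; it only contrasts the old two-sided pytest.approx comparison with
the one-sided difference check that the assertions in this patch switch to, so that a vLLM score
which beats the baseline no longer fails the test.

# Hypothetical numbers for illustration only.
st_main_score = 0.334    # baseline (SentenceTransformers run or stored constant)
vllm_main_score = 0.336  # vLLM result, slightly better than the baseline
MTEB_RERANK_TOL = 1e-4   # illustrative tolerance, not the real constant's value

# Old two-sided check: fails here, because the scores differ by more than the
# tolerance even though vLLM is the better of the two.
two_sided_ok = abs(st_main_score - vllm_main_score) < MTEB_RERANK_TOL  # False

# New one-sided check: passes, because only a regression (baseline minus vLLM
# result) larger than the tolerance should fail.
one_sided_ok = st_main_score - vllm_main_score < MTEB_RERANK_TOL  # True

print(two_sided_ok, one_sided_ok)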