[CI] Accelerate mteb test by setting SentenceTransformers mteb score to a constant (#24088)

Signed-off-by: wang.yuqi <noooop@126.com>
wang.yuqi 2025-09-03 17:23:56 +08:00 committed by GitHub
parent 9c99e4871f
commit 51383bd472
17 changed files with 83 additions and 52 deletions
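In short, each ModelInfo entry can now carry a pre-computed SentenceTransformers baseline in the new mteb_score field; when it is set, the MTEB tests compare the vLLM score against that constant instead of re-running the SentenceTransformers model, and the comparison becomes a one-sided tolerance check (scoring above the baseline never fails). Below is a minimal sketch of that pattern, condensed from the diff; check_mteb_score, compute_st_baseline, and tol are hypothetical names used only for illustration, not helpers from the patch.

from dataclasses import dataclass
from typing import Callable, Optional


@dataclass
class ModelInfo:
    name: str
    # Pre-computed SentenceTransformers MTEB main score; None means
    # "recompute the baseline at test time" (the old, slow behaviour).
    mteb_score: Optional[float] = None


def check_mteb_score(model_info: ModelInfo,
                     vllm_main_score: float,
                     compute_st_baseline: Callable[[str], float],
                     tol: float) -> None:
    """Hypothetical helper mirroring the logic added in this commit."""
    if model_info.mteb_score is None:
        # Slow path: run SentenceTransformers to obtain the baseline.
        st_main_score = compute_st_baseline(model_info.name)
    else:
        # Fast path: use the stored constant and skip SentenceTransformers.
        st_main_score = model_info.mteb_score

    # One-sided test: only fail when vLLM scores meaningfully below the
    # baseline; beating the baseline is acceptable.
    assert st_main_score - vllm_main_score < tol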

View File

@ -37,4 +37,6 @@ def test_mteb_embed(server):
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)
assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < MTEB_EMBED_TOL

View File

@ -6,16 +6,19 @@ import pytest
# yapf conflicts with isort for this block
# yapf: disable
from tests.models.language.pooling.mteb_utils import (
MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
RerankClientMtebEncoder, ScoreClientMtebEncoder,
mteb_test_rerank_models_hf, run_mteb_rerank)
from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
MTEB_RERANK_TASKS,
MTEB_RERANK_TOL,
RerankClientMtebEncoder,
ScoreClientMtebEncoder,
run_mteb_rerank)
# yapf: enable
from tests.utils import RemoteOpenAIServer
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
st_main_score = 0.33457
@pytest.fixture(scope="module")
@ -29,15 +32,7 @@ def server():
yield remote_server
@pytest.fixture(scope="module")
def st_main_score(hf_runner):
# The main score related to the version of the dependency.
# So we need to recalculate every time.
main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
return main_score
def test_mteb_score(server, st_main_score):
def test_mteb_score(server):
url = server.url_for("score")
encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@ -47,10 +42,12 @@ def test_mteb_score(server, st_main_score):
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)
assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
def test_mteb_rerank(server, st_main_score):
def test_mteb_rerank(server):
url = server.url_for("rerank")
encoder = RerankClientMtebEncoder(MODEL_NAME, url)
vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@ -60,4 +57,6 @@ def test_mteb_rerank(server, st_main_score):
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)
assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < MTEB_RERANK_TOL

View File

@ -35,10 +35,7 @@ def correctness_test_embed_models(hf_runner,
example_prompts,
vllm_extra_kwargs=None,
hf_model_callback=None):
if not model_info.enable_test:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest.skip("Skipping test.")
pytest.skip("Debug only, ci prefers to use mteb test.")
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"

View File

@ -18,7 +18,7 @@ from tests.models.utils import EmbedModelInfo, RerankModelInfo
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 0.02
MTEB_EMBED_TOL = 1e-4
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
@ -192,22 +192,28 @@ def mteb_test_embed_models(hf_runner,
MTEB_EMBED_TASKS)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
with hf_runner(model_info.name,
is_sentence_transformer=True,
dtype="float32") as hf_model:
if model_info.mteb_score is None:
with hf_runner(model_info.name,
is_sentence_transformer=True,
dtype="float32") as hf_model:
if hf_model_callback is not None:
hf_model_callback(hf_model)
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", vllm_dtype, vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol
def run_mteb_rerank(cross_encoder, tasks, languages):
@ -310,12 +316,18 @@ def mteb_test_rerank_models(hf_runner,
languages=MTEB_RERANK_LANGS)
vllm_dtype = model_config.dtype
st_main_score, st_dtype = mteb_test_rerank_models_hf(
hf_runner, model_info.name, hf_model_callback)
if model_info.mteb_score is None:
st_main_score, st_dtype = mteb_test_rerank_models_hf(
hf_runner, model_info.name, hf_model_callback)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", vllm_dtype, vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol

View File

@ -12,6 +12,7 @@ MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
enable_test=True),
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
architecture="BertModel",
@ -52,10 +53,12 @@ MODELS = [
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
enable_test=True),
########## Qwen2Model
LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
enable_test=True),
]
@ -65,6 +68,7 @@ RERANK_MODELS = [
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398,
enable_test=True),
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-large",

View File

@ -104,7 +104,6 @@ class GemmaMtebEncoder(VllmMtebEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.prompt = PROMPT
self.query_template = "A: {query}\n"
self.document_template = "B: {doc}\n{prompt}"
@ -119,7 +118,7 @@ class GemmaMtebEncoder(VllmMtebEncoder):
_sentences = []
for query, corpus, prompt in sentences:
query = self.query_template.format(query=query)
corpus = self.document_template.format(doc=corpus, prompt=prompt)
corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
_sentences.append((query, corpus, prompt))
return super().predict(_sentences, *args, **kwargs)

View File

@ -8,8 +8,10 @@ from .mteb_utils import mteb_test_rerank_models
RERANK_MODELS = [
CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification"),
LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification")
]

View File

@ -7,7 +7,7 @@ import pytest
from vllm.config import PoolerConfig
from vllm.platforms import current_platform
from ...utils import check_embeddings_close, check_transformers_version
from ...utils import check_embeddings_close
@pytest.mark.parametrize(
@ -30,7 +30,6 @@ from ...utils import check_embeddings_close, check_transformers_version
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
# [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
],
@ -42,8 +41,6 @@ def test_models(
model,
monkeypatch,
) -> None:
if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
check_transformers_version(model, max_transformers_version="4.53.2")
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention

View File

@ -5,13 +5,14 @@ import pytest
from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
EmbedModelInfo, LASTPoolingEmbedModelInfo,
RerankModelInfo, check_transformers_version)
RerankModelInfo)
from .embed_utils import correctness_test_embed_models
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
enable_test=True),
CLSPoolingEmbedModelInfo("thenlper/gte-base",
@ -30,28 +31,37 @@ MODELS = [
architecture="BertModel",
enable_test=False),
########### NewModel
# These three architectures are almost, but not exactly, the same. For
# example, they differ in:
# - whether to use token_type_embeddings
# - whether to use context expansion
# So we only test one (the most widely used) model.
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
enable_test=False),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
enable_test=False),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
enable_test=True),
########## ModernBertModel
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
enable_test=True),
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=True),
@ -65,10 +75,12 @@ RERANK_MODELS = [
CLSPoolingRerankModelInfo(
# classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386,
architecture="ModernBertForSequenceClassification",
enable_test=True),
CLSPoolingRerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062,
architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
enable_test=True),
@ -78,10 +90,6 @@ RERANK_MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
check_transformers_version(model_info.name,
max_transformers_version="4.53.2")
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@ -89,10 +97,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
check_transformers_version(model_info.name,
max_transformers_version="4.53.2")
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)

View File

@ -10,6 +10,7 @@ MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/e5-base",
architecture="BertModel",
@ -23,6 +24,7 @@ MODELS = [
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
@ -36,7 +38,7 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)

View File

@ -14,6 +14,7 @@ from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True)
]
@ -21,6 +22,7 @@ EMBEDDING_MODELS = [
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification")
]

View File

@ -20,6 +20,7 @@ RERANK_MODELS = [
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
enable_test=True),
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",

View File

@ -10,6 +10,7 @@ from .mteb_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
@ -19,6 +20,7 @@ MODELS = [
enable_test=False),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True)
]

View File

@ -20,6 +20,7 @@ qwen3_reranker_hf_overrides = {
RERANK_MODELS = [
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=True),
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",

View File

@ -11,6 +11,7 @@ MODELS = [
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
@ -23,6 +24,7 @@ MODELS = [
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
@ -31,14 +33,17 @@ MODELS = [
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
enable_test=True),
]
@ -46,7 +51,7 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)

View File

@ -10,6 +10,7 @@ ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo(
"TencentBAC/Conan-embedding-v1",
architecture="BertModel",
mteb_score=0.688611955,
enable_test=True,
),
]

View File

@ -347,6 +347,7 @@ class ModelInfo:
dtype: str = "auto"
hf_overrides: Optional[dict[str, Any]] = None
default_pooling_type: str = ""
mteb_score: Optional[float] = None
enable_test: bool = True