[Misc] Update embedding/cross encoder tests to use mteb v2 (#27329)

Signed-off-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
Roman Solomatin 2025-11-19 09:28:40 +03:00 committed by GitHub
parent 3d4e7d34be
commit 71d0ae1c54
6 changed files with 144 additions and 84 deletions


@@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
-mteb[bm25s]>=1.38.11, <2 # required for mteb test
+mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.1
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
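
mteb 2.x is a breaking release rather than a routine bump, which is why every test touching the encoder interfaces changes below. As a hedged illustration (not part of this PR), a suite that had to straddle both majors could gate v2-only tests like this:

```python
# Hypothetical version guard, not from this PR: skip v2-only tests when an
# older mteb is installed. "mteb" is the real distribution name on PyPI.
from importlib.metadata import version

import pytest

MTEB_MAJOR = int(version("mteb").split(".")[0])

requires_mteb_v2 = pytest.mark.skipif(
    MTEB_MAJOR < 2, reason="test targets the mteb v2 API"
)
```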


@@ -201,8 +201,6 @@ email-validator==2.2.0
    # via pydantic
encodec==0.1.1
    # via vocos
-eval-type-backport==0.2.2
-    # via mteb
evaluate==0.4.3
    # via lm-eval
fastapi==0.116.1
@@ -490,7 +488,7 @@ msgpack==1.1.0
    # via
    #   librosa
    #   ray
-mteb==1.38.11
+mteb==2.1.2
    # via -r requirements/test.in
multidict==6.1.0
    # via


@@ -2,12 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from collections.abc import Sequence

import mteb
import numpy as np
import requests
import torch
+from mteb.models import ModelMeta
+from mteb.types import Array
+from torch.utils.data import DataLoader

import tests.ci_envs as ci_envs
from tests.models.utils import (
@@ -27,24 +29,47 @@ MTEB_EMBED_TOL = 1e-4
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
-MTEB_RERANK_LANGS = ["en"]
+MTEB_RERANK_LANGS = ["eng"]
MTEB_RERANK_TOL = 2e-3

+_empty_model_meta = ModelMeta(
+    loader=None,
+    name="vllm/model",
+    revision="1",
+    release_date=None,
+    languages=None,
+    framework=[],
+    similarity_fn_name=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    max_tokens=None,
+    embed_dim=None,
+    license=None,
+    open_weights=None,
+    public_training_code=None,
+    public_training_data=None,
+    use_instructions=None,
+    training_datasets=None,
+    modalities=["text"],  # 'image' can be added to evaluate multimodal models
+)
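
Two v2 conventions surface in this hunk: task languages are now three-letter ISO 639-3 codes ("eng" instead of "en"), and every model under evaluation is expected to carry a `ModelMeta`, which the tests satisfy with the all-`None` stub above. A minimal usage sketch, assuming `llm` is a loaded `vllm.LLM` embedding model:

```python
# Sketch only: the stub metadata above is what mteb v2 records for the
# wrapped model; "llm" is assumed to be a loaded vllm.LLM instance.
encoder = VllmMtebEncoder(llm)
assert encoder.mteb_model_meta.name == "vllm/model"
assert encoder.mteb_model_meta.modalities == ["text"]
```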
-class VllmMtebEncoder(mteb.Encoder):
+class VllmMtebEncoder(mteb.EncoderProtocol):
+    mteb_model_meta = _empty_model_meta

    def __init__(self, vllm_model):
        super().__init__()
        self.llm = vllm_model
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
-        sentences: Sequence[str],
+        inputs: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        # Hoping to discover potential scheduling
        # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]
        outputs = self.llm.embed(sentences, use_tqdm=False)
@@ -52,36 +77,70 @@ class VllmMtebEncoder(mteb.Encoder):
        embeds = embeds[np.argsort(r)]
        return embeds
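
The `encode()` signature change is the heart of the migration: v1 passed a flat `Sequence[str]`, while v2 hands the model a `DataLoader` of batched inputs, which the comprehension above flattens back into a list. Reduced to plain data, assuming each batch is a dict with a `"text"` key (per `mteb.types.BatchedInput`):

```python
# Minimal sketch of the v2 input shape assumed above: each batch is a dict
# of lists, and flattening recovers the original sentence order.
batches = [{"text": ["a", "b"]}, {"text": ["c"]}]
sentences = [text for batch in batches for text in batch["text"]]
assert sentences == ["a", "b", "c"]
```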
    def similarity(
        self,
        embeddings1: np.ndarray,
        embeddings2: np.ndarray,
    ) -> np.ndarray:
        # Cosine similarity
        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
        sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
        return sim

    def similarity_pairwise(
        self,
        embeddings1: Array,
        embeddings2: Array,
    ) -> Array:
        # Cosine similarity
        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
        sim = np.sum(embeddings1 * embeddings2, axis=1) / (
            norm1.flatten() * norm2.flatten()
        )
        return sim
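
`similarity()` and `similarity_pairwise()` differ only in shape: the former scores every (row, row) pair, the latter scores row i against row i. A quick numpy check of the two formulas used above:

```python
import numpy as np

# similarity(): (n, m) matrix of all pairs; similarity_pairwise(): length-n
# vector, which equals the matrix diagonal when n == m.
a = np.array([[1.0, 0.0], [0.0, 2.0]])
b = np.array([[1.0, 0.0], [1.0, 1.0]])
matrix = np.dot(a, b.T) / (
    np.linalg.norm(a, axis=1, keepdims=True)
    * np.linalg.norm(b, axis=1, keepdims=True).T
)
pairwise = np.sum(a * b, axis=1) / (
    np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
)
assert matrix.shape == (2, 2) and pairwise.shape == (2,)
assert np.allclose(np.diag(matrix), pairwise)
```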
+class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
+    def __init__(self, vllm_model):
+        self.llm = vllm_model
+        self.rng = np.random.default_rng(seed=42)
+
    def predict(
        self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-        queries = [s[0] for s in sentences]
-        corpus = [s[1] for s in sentences]
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]

        outputs = self.llm.score(
            queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
        )
        scores = np.array(outputs)
-        scores = scores[np.argsort(r)]
        return scores
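
v2 drives cross encoders with two parallel DataLoaders (queries and documents) instead of one list of tuples. A self-contained sketch of that calling convention, with a stub standing in for the vLLM model (torch's default collate turns a list of `{"text": str}` rows into `{"text": [str, ...]}` batches):

```python
from torch.utils.data import DataLoader

class _StubScorer:
    # Stands in for vllm.LLM; accepts the keyword arguments that
    # VllmMtebCrossEncoder.predict passes through.
    def score(self, queries, corpus, **kwargs):
        return [float(len(q) + len(d)) for q, d in zip(queries, corpus)]

inputs1 = DataLoader([{"text": q} for q in ["q1", "q2"]], batch_size=2)
inputs2 = DataLoader([{"text": d} for d in ["d1", "d2"]], batch_size=2)

scores = VllmMtebCrossEncoder(_StubScorer()).predict(inputs1, inputs2)
assert scores.shape == (2,)
```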
-class OpenAIClientMtebEncoder(mteb.Encoder):
+class OpenAIClientMtebEncoder(VllmMtebEncoder):
    def __init__(self, model_name: str, client):
        super().__init__()
        self.model_name = model_name
        self.client = client
        self.rng = np.random.default_rng(seed=42)

-    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
        # Hoping to discover potential scheduling
        # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]
@@ -94,28 +153,29 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
        return embeds
-class ScoreClientMtebEncoder(mteb.Encoder):
+class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
    def __init__(self, model_name: str, url):
        super().__init__()
        self.model_name = model_name
        self.url = url
        self.rng = np.random.default_rng(seed=42)

    def predict(
        self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        full_corpus = [text for batch in inputs2 for text in batch["text"]]

        outputs = []
-        for query, corpus, prompt in sentences:
+        for query, corpus in zip(queries, full_corpus):
            outputs.append(self.get_score(query, corpus))

        scores = np.array(outputs)
-        scores = scores[np.argsort(r)]
        return scores
    def get_score(self, query, corpus):
@@ -145,16 +205,13 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
        return response["results"][0]["relevance_score"]
-def run_mteb_embed_task(encoder, tasks):
+def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
    tasks = mteb.get_tasks(tasks=tasks)
-    evaluation = mteb.MTEB(tasks=tasks)
-    results = evaluation.run(
+    results = mteb.evaluate(
        encoder,
-        verbosity=0,
-        output_folder=None,
-        encode_kwargs={
-            "show_progress_bar": False,
-        },
+        tasks,
+        cache=None,
+        show_progress_bar=False,
    )
    main_score = results[0].scores["test"][0]["main_score"]
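
This is the other half of the migration: v1's `mteb.MTEB(tasks=tasks).run(...)` becomes a single `mteb.evaluate(...)` call, with `cache=None` taking over from `output_folder=None` to keep test runs from persisting results. A hypothetical call, assuming `llm` is a loaded `vllm.LLM` embedding model:

```python
# Hypothetical usage; "STS12" is only an illustrative mteb task name.
score = run_mteb_embed_task(VllmMtebEncoder(llm), ["STS12"])
print(score)
```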
@@ -244,33 +301,39 @@ def mteb_test_embed_models(
    assert st_main_score - vllm_main_score < atol
-def run_mteb_rerank(cross_encoder, tasks, languages):
-    with tempfile.TemporaryDirectory() as results_folder:
+def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
+    with tempfile.TemporaryDirectory() as prediction_folder:
        bm25s = mteb.get_model("bm25s")
-        tasks = mteb.get_tasks(tasks=tasks, languages=languages)
-        subset = "default"
        eval_splits = ["test"]

-        evaluation = mteb.MTEB(tasks=tasks)
-        evaluation.run(
-            bm25s,
-            verbosity=0,
-            eval_splits=eval_splits,
-            save_predictions=True,
-            output_folder=f"{results_folder}/stage1",
-            encode_kwargs={"show_progress_bar": False},
-        )
-        results = evaluation.run(
-            cross_encoder,
-            verbosity=0,
-            eval_splits=eval_splits,
-            save_predictions=True,
-            output_folder=f"{results_folder}/stage2",
-            previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
-            encode_kwargs={"show_progress_bar": False},
-        )
+        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
+            tasks=tasks, languages=languages, eval_splits=eval_splits
+        )
+        mteb.evaluate(
+            bm25s,
+            mteb_tasks,
+            prediction_folder=prediction_folder,
+            show_progress_bar=False,
+            # don't save results for test runs
+            cache=None,
+            overwrite_strategy="always",
+        )
+
+        second_stage_tasks = []
+        for task in mteb_tasks:
+            second_stage_tasks.append(
+                task.convert_to_reranking(
+                    prediction_folder,
+                    top_k=10,
+                )
+            )
+        results = mteb.evaluate(
+            cross_encoder,
+            second_stage_tasks,
+            show_progress_bar=False,
+            cache=None,
+        )
        main_score = results[0].scores["test"][0]["main_score"]
        return main_score
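
The two-stage rerank flow changes shape as well: v1 saved the bm25s first-stage predictions to an `output_folder` and fed them back via `previous_results`, while v2 writes them to a `prediction_folder` and derives reranking tasks with `task.convert_to_reranking(..., top_k=10)`. A hypothetical invocation, assuming `llm` is a loaded vLLM scoring model:

```python
# Hypothetical usage with the constants defined at the top of this file.
main_score = run_mteb_rerank(
    VllmMtebCrossEncoder(llm),
    tasks=MTEB_RERANK_TASKS,      # ["NFCorpus"]
    languages=MTEB_RERANK_LANGS,  # ["eng"]
)
```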
@@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf(
    hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
):
    with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
-        original_predict = hf_model.predict
-
-        def _predict(
-            sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
-            *args,
-            **kwargs,
-        ):
-            # vllm and st both remove the prompt, fair comparison.
-            prompts = [(s[0], s[1]) for s in sentences]
-            return original_predict(prompts, *args, **kwargs, batch_size=8)
-
-        hf_model.predict = _predict
-        hf_model.original_predict = original_predict
-
        if hf_model_callback is not None:
            hf_model_callback(hf_model)
@@ -310,7 +359,7 @@ def mteb_test_rerank_models(
    model_info: RerankModelInfo,
    vllm_extra_kwargs=None,
    hf_model_callback=None,
-    vllm_mteb_encoder=VllmMtebEncoder,
+    vllm_mteb_encoder=VllmMtebCrossEncoder,
    atol=MTEB_RERANK_TOL,
):
    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)


@@ -2,13 +2,15 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any

+import mteb
import numpy as np
import pytest
import torch
+from torch.utils.data import DataLoader

from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import (
-    VllmMtebEncoder,
+    VllmMtebCrossEncoder,
    mteb_test_rerank_models,
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
@@ -103,7 +105,7 @@ class GemmaRerankerHfRunner(HfRunner):
        return torch.Tensor(scores)

-class GemmaMtebEncoder(VllmMtebEncoder):
+class GemmaMtebEncoder(VllmMtebCrossEncoder):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.query_template = "A: {query}\n"
@@ -111,17 +113,26 @@ class GemmaMtebEncoder(VllmMtebEncoder):
    def predict(
        self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
-        _sentences = []
-        for query, corpus, prompt in sentences:
-            query = self.query_template.format(query=query)
-            corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
-            _sentences.append((query, corpus, prompt))
-        return super().predict(_sentences, *args, **kwargs)
+        queries = [
+            self.query_template.format(query=text)
+            for batch in inputs1
+            for text in batch["text"]
+        ]
+        corpus = [
+            self.document_template.format(doc=text, prompt=PROMPT)
+            for batch in inputs2
+            for text in batch["text"]
+        ]
+
+        outputs = self.llm.score(
+            queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
+        )
+        scores = np.array(outputs)
+        return scores
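
The Gemma wrapper now applies its templates while flattening the v2 batches, rather than pre-building `(query, corpus, prompt)` tuples and delegating to `super().predict()`. Reduced to plain data with the query template shown above (the document template is defined outside this hunk, so it is omitted here):

```python
# Sketch of the per-text templating above, on plain data.
batches = [{"text": ["what is vllm?"]}]
queries = ["A: {query}\n".format(query=t) for b in batches for t in b["text"]]
assert queries == ["A: what is vllm?\n"]
```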
@pytest.mark.parametrize("model_info", RERANK_MODELS)


@@ -70,8 +70,9 @@ class MxbaiRerankerHfRunner(HfRunner):
            return scores

        scores = []
-        for prompt in prompts:
-            inputs = process_inputs([prompt])
+        for query, doc, *_ in prompts:
+            pairs = [(query, doc)]
+            inputs = process_inputs(pairs)
            score = compute_logits(inputs)
            scores.append(score[0].item())
        return torch.Tensor(scores)


@@ -72,8 +72,9 @@ class Qwen3RerankerHfRunner(HfRunner):
            return scores

        scores = []
-        for prompt in prompts:
-            inputs = process_inputs([prompt])
+        for query, doc, *_ in prompts:
+            pairs = [(query, doc)]
+            inputs = process_inputs(pairs)
            score = compute_logits(inputs)
            scores.append(score[0].item())
        return torch.Tensor(scores)
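
Both HF reranker runners get the same fix: `predict()` now receives `(query, document, ...)` tuples instead of pre-joined prompts, so each loop unpacks the pair and discards any trailing field (such as a prompt) with `*_`:

```python
# The unpacking pattern used in both runners, on plain data.
prompts = [("q1", "d1", None), ("q2", "d2", None)]
pairs = [(query, doc) for query, doc, *_ in prompts]
assert pairs == [("q1", "d1"), ("q2", "d2")]
```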