[Misc] Update embedding/cross encoder tests to use mteb v2 (#27329)

Signed-off-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
Roman Solomatin authored on 2025-11-19 09:28:40 +03:00; committed by GitHub
parent 3d4e7d34be
commit 71d0ae1c54
6 changed files with 144 additions and 84 deletions
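
The change tracks the mteb v1 -> v2 API migration: the v1 entry point mteb.MTEB(tasks=tasks).run(encoder, ...) is replaced by mteb.evaluate(...), and the test encoders now implement the v2 protocol classes (mteb.EncoderProtocol / mteb.CrossEncoderProtocol), which consume DataLoader[mteb.types.BatchedInput] batches instead of plain string lists and expose an mteb_model_meta. The sketch below shows the new shape in isolation; it assumes mteb>=2 and uses a hypothetical DummyEncoder that returns random vectors purely for illustration, not the vLLM-backed encoders added in the diff.

    import mteb
    import numpy as np
    from mteb.models import ModelMeta
    from torch.utils.data import DataLoader


    class DummyEncoder(mteb.EncoderProtocol):
        # Hypothetical stand-in for VllmMtebEncoder; the real tests wrap a vLLM model.
        mteb_model_meta = ModelMeta(
            loader=None,
            name="dummy/model",
            revision="1",
            release_date=None,
            languages=None,
            framework=[],
            similarity_fn_name=None,
            n_parameters=None,
            memory_usage_mb=None,
            max_tokens=None,
            embed_dim=None,
            license=None,
            open_weights=None,
            public_training_code=None,
            public_training_data=None,
            use_instructions=None,
            training_datasets=None,
            modalities=["text"],
        )

        def encode(self, inputs: DataLoader, *args, **kwargs) -> np.ndarray:
            # mteb v2 hands the encoder batched inputs; raw strings sit under "text".
            sentences = [text for batch in inputs for text in batch["text"]]
            return np.random.default_rng(0).standard_normal((len(sentences), 384))

        def similarity(self, embeddings1: np.ndarray, embeddings2: np.ndarray) -> np.ndarray:
            # Cosine similarity, mirroring the helper added to mteb_utils.py below.
            norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
            norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
            return np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)


    # v1: mteb.MTEB(tasks=tasks).run(encoder, ...)  ->  v2: mteb.evaluate(...)
    tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"])
    results = mteb.evaluate(DummyEncoder(), tasks, cache=None, show_progress_bar=False)
    main_score = results[0].scores["test"][0]["main_score"]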

View File: requirements/test.in

@@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 # TODO: Use lm-eval[api]==0.4.10 once released
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
-mteb[bm25s]>=1.38.11, <2 # required for mteb test
+mteb[bm25s]>=2, <3 # required for mteb test
 transformers==4.57.1
 tokenizers==0.22.0
 schemathesis>=3.39.15 # Required for openai schema test.

View File: compiled requirements lock (pins generated from requirements/test.in)

@@ -201,8 +201,6 @@ email-validator==2.2.0
     # via pydantic
 encodec==0.1.1
     # via vocos
-eval-type-backport==0.2.2
-    # via mteb
 evaluate==0.4.3
     # via lm-eval
 fastapi==0.116.1
@@ -490,7 +488,7 @@ msgpack==1.1.0
     # via
     #   librosa
     #   ray
-mteb==1.38.11
+mteb==2.1.2
     # via -r requirements/test.in
 multidict==6.1.0
     # via
View File: tests/models/language/pooling_mteb_test/mteb_utils.py

@@ -2,12 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import tempfile
-from collections.abc import Sequence
 
 import mteb
 import numpy as np
 import requests
 import torch
+from mteb.models import ModelMeta
+from mteb.types import Array
+from torch.utils.data import DataLoader
 
 import tests.ci_envs as ci_envs
 from tests.models.utils import (
@@ -27,24 +29,47 @@ MTEB_EMBED_TOL = 1e-4
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
-MTEB_RERANK_LANGS = ["en"]
+MTEB_RERANK_LANGS = ["eng"]
 MTEB_RERANK_TOL = 2e-3
 
+_empty_model_meta = ModelMeta(
+    loader=None,
+    name="vllm/model",
+    revision="1",
+    release_date=None,
+    languages=None,
+    framework=[],
+    similarity_fn_name=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    max_tokens=None,
+    embed_dim=None,
+    license=None,
+    open_weights=None,
+    public_training_code=None,
+    public_training_data=None,
+    use_instructions=None,
+    training_datasets=None,
+    modalities=["text"],  # 'image' can be added to evaluate multimodal models
+)
+
 
-class VllmMtebEncoder(mteb.Encoder):
+class VllmMtebEncoder(mteb.EncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
     def __init__(self, vllm_model):
-        super().__init__()
         self.llm = vllm_model
         self.rng = np.random.default_rng(seed=42)
 
     def encode(
         self,
-        sentences: Sequence[str],
+        inputs: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
         # Hoping to discover potential scheduling
         # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
         outputs = self.llm.embed(sentences, use_tqdm=False)
@@ -52,36 +77,70 @@ class VllmMtebEncoder(mteb.Encoder):
         embeds = embeds[np.argsort(r)]
         return embeds
 
+    def similarity(
+        self,
+        embeddings1: np.ndarray,
+        embeddings2: np.ndarray,
+    ) -> np.ndarray:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
+        return sim
+
+    def similarity_pairwise(
+        self,
+        embeddings1: Array,
+        embeddings2: Array,
+    ) -> Array:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.sum(embeddings1 * embeddings2, axis=1) / (
+            norm1.flatten() * norm2.flatten()
+        )
+        return sim
+
+
+class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
+    def __init__(self, vllm_model):
+        self.llm = vllm_model
+        self.rng = np.random.default_rng(seed=42)
+
     def predict(
         self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-
-        queries = [s[0] for s in sentences]
-        corpus = [s[1] for s in sentences]
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]
 
         outputs = self.llm.score(
             queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
         )
         scores = np.array(outputs)
-        scores = scores[np.argsort(r)]
         return scores
 
 
-class OpenAIClientMtebEncoder(mteb.Encoder):
+class OpenAIClientMtebEncoder(VllmMtebEncoder):
     def __init__(self, model_name: str, client):
-        super().__init__()
         self.model_name = model_name
         self.client = client
         self.rng = np.random.default_rng(seed=42)
 
-    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
         # Hoping to discover potential scheduling
         # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
@@ -94,28 +153,29 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
         return embeds
 
 
-class ScoreClientMtebEncoder(mteb.Encoder):
+class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
     def __init__(self, model_name: str, url):
-        super().__init__()
         self.model_name = model_name
         self.url = url
         self.rng = np.random.default_rng(seed=42)
 
     def predict(
         self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        full_corpus = [text for batch in inputs2 for text in batch["text"]]
 
         outputs = []
-        for query, corpus, prompt in sentences:
+        for query, corpus in zip(queries, full_corpus):
             outputs.append(self.get_score(query, corpus))
         scores = np.array(outputs)
-        scores = scores[np.argsort(r)]
         return scores
 
     def get_score(self, query, corpus):
@@ -145,16 +205,13 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
         return response["results"][0]["relevance_score"]
 
 
-def run_mteb_embed_task(encoder, tasks):
+def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
     tasks = mteb.get_tasks(tasks=tasks)
-    evaluation = mteb.MTEB(tasks=tasks)
-    results = evaluation.run(
+    results = mteb.evaluate(
         encoder,
-        verbosity=0,
-        output_folder=None,
-        encode_kwargs={
-            "show_progress_bar": False,
-        },
+        tasks,
+        cache=None,
+        show_progress_bar=False,
     )
 
     main_score = results[0].scores["test"][0]["main_score"]
@@ -244,33 +301,39 @@ def mteb_test_embed_models(
     assert st_main_score - vllm_main_score < atol
 
 
-def run_mteb_rerank(cross_encoder, tasks, languages):
-    with tempfile.TemporaryDirectory() as results_folder:
+def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
+    with tempfile.TemporaryDirectory() as prediction_folder:
         bm25s = mteb.get_model("bm25s")
-        tasks = mteb.get_tasks(tasks=tasks, languages=languages)
-
-        subset = "default"
         eval_splits = ["test"]
 
-        evaluation = mteb.MTEB(tasks=tasks)
-        evaluation.run(
-            bm25s,
-            verbosity=0,
-            eval_splits=eval_splits,
-            save_predictions=True,
-            output_folder=f"{results_folder}/stage1",
-            encode_kwargs={"show_progress_bar": False},
+        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
+            tasks=tasks, languages=languages, eval_splits=eval_splits
         )
-        results = evaluation.run(
+
+        mteb.evaluate(
+            bm25s,
+            mteb_tasks,
+            prediction_folder=prediction_folder,
+            show_progress_bar=False,
+            # don't save results for test runs
+            cache=None,
+            overwrite_strategy="always",
+        )
+
+        second_stage_tasks = []
+        for task in mteb_tasks:
+            second_stage_tasks.append(
+                task.convert_to_reranking(
+                    prediction_folder,
+                    top_k=10,
+                )
+            )
+
+        results = mteb.evaluate(
             cross_encoder,
-            verbosity=0,
-            eval_splits=eval_splits,
-            top_k=10,
-            save_predictions=True,
-            output_folder=f"{results_folder}/stage2",
-            previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
-            encode_kwargs={"show_progress_bar": False},
+            second_stage_tasks,
+            show_progress_bar=False,
+            cache=None,
         )
     main_score = results[0].scores["test"][0]["main_score"]
     return main_score
@@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf(
     hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
 ):
     with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
-        original_predict = hf_model.predict
-
-        def _predict(
-            sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
-            *args,
-            **kwargs,
-        ):
-            # vllm and st both remove the prompt, fair comparison.
-            prompts = [(s[0], s[1]) for s in sentences]
-            return original_predict(prompts, *args, **kwargs, batch_size=8)
-
-        hf_model.predict = _predict
-        hf_model.original_predict = original_predict
-
         if hf_model_callback is not None:
             hf_model_callback(hf_model)
@@ -310,7 +359,7 @@ def mteb_test_rerank_models(
     model_info: RerankModelInfo,
     vllm_extra_kwargs=None,
     hf_model_callback=None,
-    vllm_mteb_encoder=VllmMtebEncoder,
+    vllm_mteb_encoder=VllmMtebCrossEncoder,
     atol=MTEB_RERANK_TOL,
 ):
     vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
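
Because mteb v2 drops the previous_results/save_predictions flow, run_mteb_rerank above builds the two-stage pipeline explicitly: the first mteb.evaluate call writes bm25s retrieval predictions into prediction_folder, convert_to_reranking turns each retrieval task into a reranking task over the top-10 retrieved documents, and only that second stage is scored with the cross encoder. Below is a condensed sketch of the same flow, assuming a cross_encoder implementing mteb.CrossEncoderProtocol (e.g. VllmMtebCrossEncoder above); the helper name rerank_main_score is illustrative only.

    import tempfile

    import mteb


    def rerank_main_score(cross_encoder: mteb.CrossEncoderProtocol) -> float:
        with tempfile.TemporaryDirectory() as prediction_folder:
            bm25s = mteb.get_model("bm25s")
            tasks = mteb.get_tasks(
                tasks=["NFCorpus"], languages=["eng"], eval_splits=["test"]
            )

            # Stage 1: first-stage retrieval; predictions are written to prediction_folder.
            mteb.evaluate(
                bm25s,
                tasks,
                prediction_folder=prediction_folder,
                cache=None,
                show_progress_bar=False,
                overwrite_strategy="always",
            )

            # Stage 2: rerank the top-10 retrieved documents with the cross encoder.
            rerank_tasks = [t.convert_to_reranking(prediction_folder, top_k=10) for t in tasks]
            results = mteb.evaluate(
                cross_encoder, rerank_tasks, cache=None, show_progress_bar=False
            )
            return results[0].scores["test"][0]["main_score"]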

View File: Gemma reranker MTEB test

@@ -2,13 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
+import mteb
 import numpy as np
 import pytest
 import torch
+from torch.utils.data import DataLoader
 
 from tests.conftest import HfRunner
 from tests.models.language.pooling_mteb_test.mteb_utils import (
-    VllmMtebEncoder,
+    VllmMtebCrossEncoder,
     mteb_test_rerank_models,
 )
 from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
@@ -103,7 +105,7 @@ class GemmaRerankerHfRunner(HfRunner):
         return torch.Tensor(scores)
 
 
-class GemmaMtebEncoder(VllmMtebEncoder):
+class GemmaMtebEncoder(VllmMtebCrossEncoder):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.query_template = "A: {query}\n"
@@ -111,17 +113,26 @@ class GemmaMtebEncoder(VllmMtebEncoder):
     def predict(
         self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        _sentences = []
-        for query, corpus, prompt in sentences:
-            query = self.query_template.format(query=query)
-            corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
-            _sentences.append((query, corpus, prompt))
-
-        return super().predict(_sentences, *args, **kwargs)
+        queries = [
+            self.query_template.format(query=text)
+            for batch in inputs1
+            for text in batch["text"]
+        ]
+        corpus = [
+            self.document_template.format(doc=text, prompt=PROMPT)
+            for batch in inputs2
+            for text in batch["text"]
+        ]
+        outputs = self.llm.score(
+            queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
+        )
+        scores = np.array(outputs)
+        return scores
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)

View File: Mxbai reranker MTEB test

@@ -70,8 +70,9 @@ class MxbaiRerankerHfRunner(HfRunner):
             return scores
 
         scores = []
-        for prompt in prompts:
-            inputs = process_inputs([prompt])
+        for query, doc, *_ in prompts:
+            pairs = [(query, doc)]
+            inputs = process_inputs(pairs)
             score = compute_logits(inputs)
             scores.append(score[0].item())
         return torch.Tensor(scores)

View File: Qwen3 reranker MTEB test

@@ -72,8 +72,9 @@ class Qwen3RerankerHfRunner(HfRunner):
             return scores
 
         scores = []
-        for prompt in prompts:
-            inputs = process_inputs([prompt])
+        for query, doc, *_ in prompts:
+            pairs = [(query, doc)]
+            inputs = process_inputs(pairs)
             score = compute_logits(inputs)
             scores.append(score[0].item())
         return torch.Tensor(scores)