From 71d0ae1c54543689ea7541aa20b9522982b0815e Mon Sep 17 00:00:00 2001
From: Roman Solomatin
Date: Wed, 19 Nov 2025 09:28:40 +0300
Subject: [PATCH] [Misc] Update embedding/cross encoder tests to use `mteb` v2
 (#27329)

Signed-off-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Signed-off-by: wang.yuqi
Signed-off-by: wang.yuqi
Co-authored-by: Cyrus Leung
Co-authored-by: Isotr0py
Co-authored-by: wang.yuqi
Co-authored-by: wang.yuqi
---
 requirements/test.in                          |   2 +-
 requirements/test.txt                         |   4 +-
 .../language/pooling_mteb_test/mteb_utils.py  | 181 +++++++++++-------
 .../test_bge_reranker_v2_gemma.py             |  31 ++-
 .../pooling_mteb_test/test_mxbai_rerank.py    |   5 +-
 .../pooling_mteb_test/test_qwen3_reranker.py  |   5 +-
 6 files changed, 144 insertions(+), 84 deletions(-)

diff --git a/requirements/test.in b/requirements/test.in
index 30d97e9b9c7d..05f6bcca5c2c 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 # TODO: Use lm-eval[api]==0.4.10 once released
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
-mteb[bm25s]>=1.38.11, <2 # required for mteb test
+mteb[bm25s]>=2, <3 # required for mteb test
 transformers==4.57.1
 tokenizers==0.22.0
 schemathesis>=3.39.15 # Required for openai schema test.
diff --git a/requirements/test.txt b/requirements/test.txt
index 3263b74c0879..bcd511660f85 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -201,8 +201,6 @@ email-validator==2.2.0
     # via pydantic
 encodec==0.1.1
     # via vocos
-eval-type-backport==0.2.2
-    # via mteb
 evaluate==0.4.3
     # via lm-eval
 fastapi==0.116.1
@@ -490,7 +488,7 @@ msgpack==1.1.0
     # via
     #   librosa
     #   ray
-mteb==1.38.11
+mteb==2.1.2
     # via -r requirements/test.in
 multidict==6.1.0
     # via
diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py
index 0384ff82790f..189cdbae99dc 100644
--- a/tests/models/language/pooling_mteb_test/mteb_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_utils.py
@@ -2,12 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import tempfile
-from collections.abc import Sequence
 
 import mteb
 import numpy as np
 import requests
 import torch
+from mteb.models import ModelMeta
+from mteb.types import Array
+from torch.utils.data import DataLoader
 
 import tests.ci_envs as ci_envs
 from tests.models.utils import (
@@ -27,24 +29,47 @@ MTEB_EMBED_TOL = 1e-4
 
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
-MTEB_RERANK_LANGS = ["en"]
+MTEB_RERANK_LANGS = ["eng"]
 MTEB_RERANK_TOL = 2e-3
 
+_empty_model_meta = ModelMeta(
+    loader=None,
+    name="vllm/model",
+    revision="1",
+    release_date=None,
+    languages=None,
+    framework=[],
+    similarity_fn_name=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    max_tokens=None,
+    embed_dim=None,
+    license=None,
+    open_weights=None,
+    public_training_code=None,
+    public_training_data=None,
+    use_instructions=None,
+    training_datasets=None,
+    modalities=["text"],  # 'image' can be added to evaluate multimodal models
+)
+
+
+class VllmMtebEncoder(mteb.EncoderProtocol):
+    mteb_model_meta = _empty_model_meta
 
-class VllmMtebEncoder(mteb.Encoder):
     def __init__(self, vllm_model):
-        super().__init__()
         self.llm = vllm_model
         self.rng = np.random.default_rng(seed=42)
 
     def encode(
         self,
-        sentences: Sequence[str],
+        inputs: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
         # Hoping to discover potential scheduling
         # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
         outputs = self.llm.embed(sentences, use_tqdm=False)
@@ -52,36 +77,70 @@ class VllmMtebEncoder(mteb.Encoder):
         embeds = embeds[np.argsort(r)]
         return embeds
 
+    def similarity(
+        self,
+        embeddings1: np.ndarray,
+        embeddings2: np.ndarray,
+    ) -> np.ndarray:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
+        return sim
+
+    def similarity_pairwise(
+        self,
+        embeddings1: Array,
+        embeddings2: Array,
+    ) -> Array:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.sum(embeddings1 * embeddings2, axis=1) / (
+            norm1.flatten() * norm2.flatten()
+        )
+        return sim
+
+
+class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
+    def __init__(self, vllm_model):
+        self.llm = vllm_model
+        self.rng = np.random.default_rng(seed=42)
+
     def predict(
         self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-
-        queries = [s[0] for s in sentences]
-        corpus = [s[1] for s in sentences]
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]
 
         outputs = self.llm.score(
             queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
         )
         scores = np.array(outputs)
-        scores = scores[np.argsort(r)]
         return scores
 
 
-class OpenAIClientMtebEncoder(mteb.Encoder):
+class OpenAIClientMtebEncoder(VllmMtebEncoder):
     def __init__(self, model_name: str, client):
-        super().__init__()
         self.model_name = model_name
         self.client = client
         self.rng = np.random.default_rng(seed=42)
 
-    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
         # Hoping to discover potential scheduling
         # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
 
@@ -94,28 +153,29 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
         return embeds
 
 
-class ScoreClientMtebEncoder(mteb.Encoder):
+class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
     def __init__(self, model_name: str, url):
-        super().__init__()
         self.model_name = model_name
         self.url = url
         self.rng = np.random.default_rng(seed=42)
 
     def predict(
         self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        full_corpus = [text for batch in inputs2 for text in batch["text"]]
 
         outputs = []
-        for query, corpus, prompt in sentences:
+        for query, corpus in zip(queries, full_corpus):
             outputs.append(self.get_score(query, corpus))
 
         scores = np.array(outputs)
-        scores = scores[np.argsort(r)]
         return scores
 
     def get_score(self, query, corpus):
@@ -145,16 +205,13 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
         return response["results"][0]["relevance_score"]
 
 
-def run_mteb_embed_task(encoder, tasks):
+def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
     tasks = mteb.get_tasks(tasks=tasks)
-    evaluation = mteb.MTEB(tasks=tasks)
-    results = evaluation.run(
+    results = mteb.evaluate(
         encoder,
-        verbosity=0,
-        output_folder=None,
-        encode_kwargs={
-            "show_progress_bar": False,
-        },
+        tasks,
+        cache=None,
+        show_progress_bar=False,
     )
 
     main_score = results[0].scores["test"][0]["main_score"]
@@ -244,33 +301,39 @@ def mteb_test_embed_models(
     assert st_main_score - vllm_main_score < atol
 
 
-def run_mteb_rerank(cross_encoder, tasks, languages):
-    with tempfile.TemporaryDirectory() as results_folder:
+def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
+    with tempfile.TemporaryDirectory() as prediction_folder:
         bm25s = mteb.get_model("bm25s")
-        tasks = mteb.get_tasks(tasks=tasks, languages=languages)
-
-        subset = "default"
         eval_splits = ["test"]
 
-        evaluation = mteb.MTEB(tasks=tasks)
-        evaluation.run(
-            bm25s,
-            verbosity=0,
-            eval_splits=eval_splits,
-            save_predictions=True,
-            output_folder=f"{results_folder}/stage1",
-            encode_kwargs={"show_progress_bar": False},
+        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
+            tasks=tasks, languages=languages, eval_splits=eval_splits
         )
 
-        results = evaluation.run(
+        mteb.evaluate(
+            bm25s,
+            mteb_tasks,
+            prediction_folder=prediction_folder,
+            show_progress_bar=False,
+            # don't save results for test runs
+            cache=None,
+            overwrite_strategy="always",
+        )
+
+        second_stage_tasks = []
+        for task in mteb_tasks:
+            second_stage_tasks.append(
+                task.convert_to_reranking(
+                    prediction_folder,
+                    top_k=10,
+                )
+            )
+
+        results = mteb.evaluate(
             cross_encoder,
-            verbosity=0,
-            eval_splits=eval_splits,
-            top_k=10,
-            save_predictions=True,
-            output_folder=f"{results_folder}/stage2",
-            previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
-            encode_kwargs={"show_progress_bar": False},
+            second_stage_tasks,
+            show_progress_bar=False,
+            cache=None,
         )
     main_score = results[0].scores["test"][0]["main_score"]
     return main_score
@@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf(
     hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
 ):
     with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
-        original_predict = hf_model.predict
-
-        def _predict(
-            sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
-            *args,
-            **kwargs,
-        ):
-            # vllm and st both remove the prompt, fair comparison.
-            prompts = [(s[0], s[1]) for s in sentences]
-            return original_predict(prompts, *args, **kwargs, batch_size=8)
-
-        hf_model.predict = _predict
-        hf_model.original_predict = original_predict
-
         if hf_model_callback is not None:
             hf_model_callback(hf_model)
 
@@ -310,7 +359,7 @@ def mteb_test_rerank_models(
     model_info: RerankModelInfo,
     vllm_extra_kwargs=None,
     hf_model_callback=None,
-    vllm_mteb_encoder=VllmMtebEncoder,
+    vllm_mteb_encoder=VllmMtebCrossEncoder,
     atol=MTEB_RERANK_TOL,
 ):
     vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
index 2927a3711136..6b2e46964492 100644
--- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
@@ -2,13 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
+import mteb
 import numpy as np
 import pytest
 import torch
+from torch.utils.data import DataLoader
 
 from tests.conftest import HfRunner
 from tests.models.language.pooling_mteb_test.mteb_utils import (
-    VllmMtebEncoder,
+    VllmMtebCrossEncoder,
     mteb_test_rerank_models,
 )
 from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
@@ -103,7 +105,7 @@ class GemmaRerankerHfRunner(HfRunner):
         return torch.Tensor(scores)
 
 
-class GemmaMtebEncoder(VllmMtebEncoder):
+class GemmaMtebEncoder(VllmMtebCrossEncoder):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.query_template = "A: {query}\n"
@@ -111,17 +113,26 @@ class GemmaMtebEncoder(VllmMtebEncoder):
 
     def predict(
         self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        _sentences = []
-        for query, corpus, prompt in sentences:
-            query = self.query_template.format(query=query)
-            corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
-            _sentences.append((query, corpus, prompt))
-
-        return super().predict(_sentences, *args, **kwargs)
+        queries = [
+            self.query_template.format(query=text)
+            for batch in inputs1
+            for text in batch["text"]
+        ]
+        corpus = [
+            self.document_template.format(doc=text, prompt=PROMPT)
+            for batch in inputs2
+            for text in batch["text"]
+        ]
+        outputs = self.llm.score(
+            queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
+        )
+        scores = np.array(outputs)
+        return scores
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
index fd04dc199023..a6f2a89b268f 100644
--- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
+++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
@@ -70,8 +70,9 @@ class MxbaiRerankerHfRunner(HfRunner):
             return scores
 
         scores = []
-        for prompt in prompts:
-            inputs = process_inputs([prompt])
+        for query, doc, *_ in prompts:
+            pairs = [(query, doc)]
+            inputs = process_inputs(pairs)
             score = compute_logits(inputs)
             scores.append(score[0].item())
         return torch.Tensor(scores)
diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
index 00e99f44cfdb..9a1be6c0be1d 100644
--- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
+++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
@@ -72,8 +72,9 @@ class Qwen3RerankerHfRunner(HfRunner):
             return scores
 
         scores = []
-        for prompt in prompts:
-            inputs = process_inputs([prompt])
+        for query, doc, *_ in prompts:
+            pairs = [(query, doc)]
+            inputs = process_inputs(pairs)
             score = compute_logits(inputs)
             scores.append(score[0].item())
         return torch.Tensor(scores)