Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-09 12:35:40 +08:00)

[Misc] Update embedding/cross encoder tests to use mteb v2 (#27329)

Signed-off-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>

parent 3d4e7d34be
commit 71d0ae1c54
@@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 # TODO: Use lm-eval[api]==0.4.10 once released
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
-mteb[bm25s]>=1.38.11, <2 # required for mteb test
+mteb[bm25s]>=2, <3 # required for mteb test
 transformers==4.57.1
 tokenizers==0.22.0
 schemathesis>=3.39.15 # Required for openai schema test.
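The pin above (requirements/test.in) moves the test suite from the mteb 1.x line to 2.x. A hypothetical sanity check, not part of this commit, for confirming which major version a test environment actually resolved:

    # Hypothetical check: the new bound requires an mteb 2.x release.
    from importlib.metadata import version

    mteb_version = version("mteb")
    assert int(mteb_version.split(".")[0]) == 2, f"expected mteb 2.x, got {mteb_version}"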
@@ -201,8 +201,6 @@ email-validator==2.2.0
     # via pydantic
 encodec==0.1.1
     # via vocos
-eval-type-backport==0.2.2
-    # via mteb
 evaluate==0.4.3
     # via lm-eval
 fastapi==0.116.1
@@ -490,7 +488,7 @@ msgpack==1.1.0
     # via
     #   librosa
     #   ray
-mteb==1.38.11
+mteb==2.1.2
     # via -r requirements/test.in
 multidict==6.1.0
     # via
@@ -2,12 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import tempfile
-from collections.abc import Sequence
 
 import mteb
 import numpy as np
 import requests
 import torch
+from mteb.models import ModelMeta
+from mteb.types import Array
+from torch.utils.data import DataLoader
 
 import tests.ci_envs as ci_envs
 from tests.models.utils import (
@@ -27,24 +29,47 @@ MTEB_EMBED_TOL = 1e-4
 
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
-MTEB_RERANK_LANGS = ["en"]
+MTEB_RERANK_LANGS = ["eng"]
 MTEB_RERANK_TOL = 2e-3
 
+_empty_model_meta = ModelMeta(
+    loader=None,
+    name="vllm/model",
+    revision="1",
+    release_date=None,
+    languages=None,
+    framework=[],
+    similarity_fn_name=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    max_tokens=None,
+    embed_dim=None,
+    license=None,
+    open_weights=None,
+    public_training_code=None,
+    public_training_data=None,
+    use_instructions=None,
+    training_datasets=None,
+    modalities=["text"],  # 'image' can be added to evaluate multimodal models
+)
+
 
-class VllmMtebEncoder(mteb.Encoder):
+class VllmMtebEncoder(mteb.EncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
     def __init__(self, vllm_model):
         super().__init__()
         self.llm = vllm_model
         self.rng = np.random.default_rng(seed=42)
 
     def encode(
         self,
-        sentences: Sequence[str],
+        inputs: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
         # Hoping to discover potential scheduling
         # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
         outputs = self.llm.embed(sentences, use_tqdm=False)
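With mteb v2, encode no longer receives a plain sequence of strings: it gets a DataLoader whose batches are mappings with a "text" key, which is why the new code flattens inputs before shuffling. A minimal standalone sketch of that flattening, assuming only the "text" field of mteb.types.BatchedInput and using toy data rather than a real task loader:

    # Default collation of dicts of strings yields {"text": [...]} batches,
    # so the flattening used above recovers the original sentence order.
    from torch.utils.data import DataLoader

    dataset = [{"text": "first sentence"}, {"text": "second"}, {"text": "third"}]
    loader = DataLoader(dataset, batch_size=2)
    sentences = [text for batch in loader for text in batch["text"]]
    assert sentences == ["first sentence", "second", "third"]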
@@ -52,36 +77,70 @@ class VllmMtebEncoder(mteb.Encoder):
         embeds = embeds[np.argsort(r)]
         return embeds
 
+    def similarity(
+        self,
+        embeddings1: np.ndarray,
+        embeddings2: np.ndarray,
+    ) -> np.ndarray:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
+        return sim
+
+    def similarity_pairwise(
+        self,
+        embeddings1: Array,
+        embeddings2: Array,
+    ) -> Array:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.sum(embeddings1 * embeddings2, axis=1) / (
+            norm1.flatten() * norm2.flatten()
+        )
+        return sim
+
+
+class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
+    def __init__(self, vllm_model):
+        self.llm = vllm_model
+        self.rng = np.random.default_rng(seed=42)
+
     def predict(
         self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-
-        queries = [s[0] for s in sentences]
-        corpus = [s[1] for s in sentences]
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]
 
         outputs = self.llm.score(
             queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
         )
         scores = np.array(outputs)
-        scores = scores[np.argsort(r)]
         return scores
 
 
-class OpenAIClientMtebEncoder(mteb.Encoder):
+class OpenAIClientMtebEncoder(VllmMtebEncoder):
     def __init__(self, model_name: str, client):
-        super().__init__()
         self.model_name = model_name
         self.client = client
         self.rng = np.random.default_rng(seed=42)
 
-    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
         # Hoping to discover potential scheduling
         # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
 
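EncoderProtocol implementations now expose both a full similarity matrix and a pairwise variant, so the class above adds cosine versions of each. A small numpy check of how the two relate (standalone sketch, not code from the commit):

    import numpy as np

    def cos_matrix(a, b):
        # entry [i, j] is the cosine similarity between a[i] and b[j]
        na = np.linalg.norm(a, axis=1, keepdims=True)
        nb = np.linalg.norm(b, axis=1, keepdims=True)
        return np.dot(a, b.T) / (na * nb.T)

    def cos_pairwise(a, b):
        # entry [i] is the cosine similarity between a[i] and b[i]
        na = np.linalg.norm(a, axis=1, keepdims=True)
        nb = np.linalg.norm(b, axis=1, keepdims=True)
        return np.sum(a * b, axis=1) / (na.flatten() * nb.flatten())

    rng = np.random.default_rng(0)
    a, b = rng.normal(size=(4, 8)), rng.normal(size=(4, 8))
    # the pairwise scores are exactly the diagonal of the full matrix
    assert np.allclose(cos_pairwise(a, b), np.diag(cos_matrix(a, b)))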
@@ -94,28 +153,29 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
         return embeds
 
 
-class ScoreClientMtebEncoder(mteb.Encoder):
+class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
     def __init__(self, model_name: str, url):
-        super().__init__()
         self.model_name = model_name
         self.url = url
         self.rng = np.random.default_rng(seed=42)
 
     def predict(
         self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        full_corpus = [text for batch in inputs2 for text in batch["text"]]
 
         outputs = []
-        for query, corpus, prompt in sentences:
+        for query, corpus in zip(queries, full_corpus):
             outputs.append(self.get_score(query, corpus))
 
         scores = np.array(outputs)
-        scores = scores[np.argsort(r)]
         return scores
 
     def get_score(self, query, corpus):
@@ -145,16 +205,13 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
         return response["results"][0]["relevance_score"]
 
 
-def run_mteb_embed_task(encoder, tasks):
+def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
     tasks = mteb.get_tasks(tasks=tasks)
-    evaluation = mteb.MTEB(tasks=tasks)
-    results = evaluation.run(
+    results = mteb.evaluate(
         encoder,
-        verbosity=0,
-        output_folder=None,
-        encode_kwargs={
-            "show_progress_bar": False,
-        },
+        tasks,
+        cache=None,
+        show_progress_bar=False,
     )
 
     main_score = results[0].scores["test"][0]["main_score"]
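The embedding entry point changes from building mteb.MTEB(tasks=...) and calling .run(...) to a single mteb.evaluate(model, tasks, ...) call, with cache=None replacing the old output_folder=None so nothing is written to disk. A hedged usage sketch of the updated helper; vllm_model and the task name are placeholders rather than values taken from this commit:

    # Hypothetical call site: VllmMtebEncoder wraps a vllm model exposing .embed().
    encoder = VllmMtebEncoder(vllm_model)  # vllm_model: assumed to exist
    main_score = run_mteb_embed_task(encoder, ["STS12"])  # placeholder task name
    print(main_score)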
@@ -244,33 +301,39 @@ def mteb_test_embed_models(
     assert st_main_score - vllm_main_score < atol
 
 
-def run_mteb_rerank(cross_encoder, tasks, languages):
-    with tempfile.TemporaryDirectory() as results_folder:
+def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
+    with tempfile.TemporaryDirectory() as prediction_folder:
         bm25s = mteb.get_model("bm25s")
-        tasks = mteb.get_tasks(tasks=tasks, languages=languages)
 
-        subset = "default"
         eval_splits = ["test"]
+        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
+            tasks=tasks, languages=languages, eval_splits=eval_splits
+        )
 
-        evaluation = mteb.MTEB(tasks=tasks)
-        evaluation.run(
-            bm25s,
-            verbosity=0,
-            eval_splits=eval_splits,
-            save_predictions=True,
-            output_folder=f"{results_folder}/stage1",
-            encode_kwargs={"show_progress_bar": False},
-        )
+        mteb.evaluate(
+            bm25s,
+            mteb_tasks,
+            prediction_folder=prediction_folder,
+            show_progress_bar=False,
+            # don't save results for test runs
+            cache=None,
+            overwrite_strategy="always",
+        )
 
-        results = evaluation.run(
+        second_stage_tasks = []
+        for task in mteb_tasks:
+            second_stage_tasks.append(
+                task.convert_to_reranking(
+                    prediction_folder,
+                    top_k=10,
+                )
+            )
+
+        results = mteb.evaluate(
             cross_encoder,
-            verbosity=0,
-            eval_splits=eval_splits,
-            top_k=10,
-            save_predictions=True,
-            output_folder=f"{results_folder}/stage2",
-            previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
-            encode_kwargs={"show_progress_bar": False},
+            second_stage_tasks,
+            show_progress_bar=False,
+            cache=None,
         )
     main_score = results[0].scores["test"][0]["main_score"]
     return main_score
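The rerank helper keeps its two-stage shape but expresses it through the v2 API: BM25 retrieval runs first with predictions written to prediction_folder, each retrieval task is converted into a reranking task over the saved top-10 candidates via task.convert_to_reranking(...), and the cross-encoder is then evaluated on those converted tasks, replacing the old save_predictions/previous_results file plumbing. A hypothetical invocation, with vllm_model standing in for a real scoring model:

    # Hypothetical call site for the rewritten two-stage helper.
    cross_encoder = VllmMtebCrossEncoder(vllm_model)  # vllm_model: assumed to exist
    score = run_mteb_rerank(cross_encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS)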
@@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf(
     hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
 ):
     with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
-        original_predict = hf_model.predict
-
-        def _predict(
-            sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
-            *args,
-            **kwargs,
-        ):
-            # vllm and st both remove the prompt, fair comparison.
-            prompts = [(s[0], s[1]) for s in sentences]
-            return original_predict(prompts, *args, **kwargs, batch_size=8)
-
-        hf_model.predict = _predict
-        hf_model.original_predict = original_predict
-
         if hf_model_callback is not None:
             hf_model_callback(hf_model)
 
@@ -310,7 +359,7 @@ def mteb_test_rerank_models(
     model_info: RerankModelInfo,
     vllm_extra_kwargs=None,
     hf_model_callback=None,
-    vllm_mteb_encoder=VllmMtebEncoder,
+    vllm_mteb_encoder=VllmMtebCrossEncoder,
     atol=MTEB_RERANK_TOL,
 ):
     vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
@@ -2,13 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
+import mteb
 import numpy as np
 import pytest
 import torch
+from torch.utils.data import DataLoader
 
 from tests.conftest import HfRunner
 from tests.models.language.pooling_mteb_test.mteb_utils import (
-    VllmMtebEncoder,
+    VllmMtebCrossEncoder,
     mteb_test_rerank_models,
 )
 from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
@@ -103,7 +105,7 @@ class GemmaRerankerHfRunner(HfRunner):
         return torch.Tensor(scores)
 
 
-class GemmaMtebEncoder(VllmMtebEncoder):
+class GemmaMtebEncoder(VllmMtebCrossEncoder):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.query_template = "A: {query}\n"
@@ -111,17 +113,26 @@ class GemmaMtebEncoder(VllmMtebEncoder):
 
     def predict(
         self,
-        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
         *args,
         **kwargs,
     ) -> np.ndarray:
-        _sentences = []
-        for query, corpus, prompt in sentences:
-            query = self.query_template.format(query=query)
-            corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
-            _sentences.append((query, corpus, prompt))
-
-        return super().predict(_sentences, *args, **kwargs)
+        queries = [
+            self.query_template.format(query=text)
+            for batch in inputs1
+            for text in batch["text"]
+        ]
+        corpus = [
+            self.document_template.format(doc=text, prompt=PROMPT)
+            for batch in inputs2
+            for text in batch["text"]
+        ]
+        outputs = self.llm.score(
+            queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
+        )
+        scores = np.array(outputs)
+        return scores
 
 
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
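The Gemma wrapper now applies its query template to the flattened DataLoader text instead of rewriting (query, corpus, prompt) tuples. A tiny illustration of that formatting step using only the query template shown above (the document template is defined in the test file and is not reproduced here):

    # Illustration of the query-template step; strings are placeholders.
    query_template = "A: {query}\n"
    queries = [query_template.format(query=text) for text in ["q1", "q2"]]
    assert queries == ["A: q1\n", "A: q2\n"]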
@@ -70,8 +70,9 @@ class MxbaiRerankerHfRunner(HfRunner):
             return scores
 
         scores = []
-        for prompt in prompts:
-            inputs = process_inputs([prompt])
+        for query, doc, *_ in prompts:
+            pairs = [(query, doc)]
+            inputs = process_inputs(pairs)
             score = compute_logits(inputs)
             scores.append(score[0].item())
         return torch.Tensor(scores)
@@ -72,8 +72,9 @@ class Qwen3RerankerHfRunner(HfRunner):
             return scores
 
         scores = []
-        for prompt in prompts:
-            inputs = process_inputs([prompt])
+        for query, doc, *_ in prompts:
+            pairs = [(query, doc)]
+            inputs = process_inputs(pairs)
             score = compute_logits(inputs)
             scores.append(score[0].item())
         return torch.Tensor(scores)
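Both HF-runner loops above now unpack each prompt as (query, doc, *_), so any trailing element such as an optional prompt string is ignored and only the query/document pair is scored. A two-line illustration with placeholder strings:

    pairs = [("q1", "d1"), ("q2", "d2", "optional prompt")]
    for query, doc, *_ in pairs:
        print((query, doc))  # the optional third element is ignored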