[CI] Reorganization pooling_mteb_test (#31265)

Signed-off-by: wang.yuqi <noooop@126.com>
wang.yuqi 2025-12-24 23:36:20 +08:00 committed by GitHub
parent 7cd288a4b3
commit 1ff67df182
18 changed files with 480 additions and 347 deletions

View File

@@ -4,7 +4,7 @@ import os
import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import (
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
MTEB_EMBED_TASKS,
MTEB_EMBED_TOL,
OpenAIClientMtebEncoder,

View File

@@ -4,7 +4,7 @@ import os
import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import (
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
MTEB_RERANK_LANGS,
MTEB_RERANK_TASKS,
MTEB_RERANK_TOL,

View File

@@ -202,11 +202,10 @@ class TestGetScorePrompt:
tokenization_kwargs,
mock_model_no_score_template,
):
# FIXME: Models implementing SupportsScoreTemplate must use their custom
# template implementation by default to preserve existing functionality.
# Attempting to use tokenizer_config.json templates would most likely break
# these models, as often they just inherit the template from the original LLM.
# CLI --chat-template overrides are still supported.
# FIXME: For now, we only apply a template when one is explicitly provided.
# We cannot rely on the tokenizer's chat template because many models
# inherit junk templates from their base LLM, which breaks both the models
# and the tests that use them.
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",

View File

@@ -0,0 +1,228 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import mteb
import numpy as np
import torch
from mteb.models import ModelMeta
from mteb.types import Array
from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import (
EmbedModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs,
)
# For most embedding models on the STS12 task (see #17175):
# - Model implementation and minor changes in tensor dtype
#   result in differences of less than 1e-4
# - A different model results in differences of more than 1e-3
# so 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4
_empty_model_meta = ModelMeta(
loader=None,
name="vllm/model",
revision="1",
release_date=None,
languages=None,
framework=[],
similarity_fn_name=None,
n_parameters=None,
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
public_training_data=None,
use_instructions=None,
training_datasets=None,
modalities=["text"], # 'image' can be added to evaluate multimodal models
)
class MtebEmbedMixin(mteb.EncoderProtocol):
mteb_model_meta = _empty_model_meta
def similarity(
self,
embeddings1: np.ndarray,
embeddings2: np.ndarray,
) -> np.ndarray:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
return sim
def similarity_pairwise(
self,
embeddings1: Array,
embeddings2: Array,
) -> Array:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.sum(embeddings1 * embeddings2, axis=1) / (
norm1.flatten() * norm2.flatten()
)
return sim
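As a side note (not part of this commit), similarity() and similarity_pairwise() above compute the same cosine similarity; the pairwise form is the diagonal of the full matrix. A minimal standalone numpy check of that equivalence, assuming non-zero rows:

import numpy as np
rng = np.random.default_rng(0)
a, b = rng.normal(size=(4, 8)), rng.normal(size=(4, 8))
norm = lambda x: np.linalg.norm(x, axis=1, keepdims=True)
full = np.dot(a, b.T) / (norm(a) * norm(b).T)  # same formula as similarity()
pairwise = np.sum(a * b, axis=1) / (norm(a).flatten() * norm(b).flatten())  # similarity_pairwise()
assert np.allclose(np.diag(full), pairwise)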
class VllmMtebEncoder(MtebEmbedMixin):
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Randomize the order in the hope of surfacing
# potential scheduling issues.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
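The encode() method above shuffles the sentences and then restores the original order with np.argsort; a minimal standalone sketch of that shuffle-then-restore idiom (illustration only, not part of this commit):

import numpy as np
rng = np.random.default_rng(seed=42)
x = np.arange(6.0).reshape(6, 1)  # stand-in for per-sentence embeddings
r = rng.permutation(len(x))
shuffled = x[r]  # the order actually sent to the model
restored = shuffled[np.argsort(r)]  # np.argsort(r) inverts the permutation
assert np.array_equal(restored, x)  # results come back in the caller's order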
class OpenAIClientMtebEncoder(MtebEmbedMixin):
def __init__(self, model_name: str, client):
self.model_name = model_name
self.client = client
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Randomize the order in the hope of surfacing
# potential scheduling issues.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
embeddings = self.client.embeddings.create(
model=self.model_name, input=sentences
)
outputs = [d.embedding for d in embeddings.data]
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
tasks = mteb.get_tasks(tasks=tasks)
results = mteb.evaluate(
encoder,
tasks,
cache=None,
show_progress_bar=False,
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score
def mteb_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
# Used to test embedding dims, NaNs, and whether normalization is applied
example_prompts = ["The chef prepared a delicious meal." * 1000]
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=model_info.max_model_len,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert model_info.architecture in model_config.architectures
# Confirm whether the important configs in model_config are correct.
if model_info.pooling_type is not None:
assert model_config.pooler_config.pooling_type == model_info.pooling_type
if model_info.attn_type is not None:
assert model_config.attn_type == model_info.attn_type
if model_info.is_prefix_caching_supported is not None:
assert (
model_config.is_prefix_caching_supported
== model_info.is_prefix_caching_supported
)
if model_info.is_chunked_prefill_supported is not None:
assert (
model_config.is_chunked_prefill_supported
== model_info.is_chunked_prefill_supported
)
vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype
# Test embedding_size, NaNs, and whether normalization is applied
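# truncate_prompt_tokens=-1 truncates the overlong example prompt to the model's maximum length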
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
outputs_tensor = torch.tensor(vllm_outputs)
assert not torch.any(torch.isnan(outputs_tensor))
embedding_size = model_config.embedding_size
assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
# Accelerate the mteb test by using a precomputed constant
# as the SentenceTransformers mteb score
if model_info.mteb_score is None:
with hf_runner(
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
# e.g. setting default parameters for the encode method of hf_runner
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
# Check embeddings close to hf outputs
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
# We do not mind if the vllm mteb results are better than the
# SentenceTransformers results, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol
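To make the one-sided check concrete, a small illustration with made-up scores (not measured values): a vLLM score at or above the SentenceTransformers score always passes, while a drop larger than the tolerance fails.

MTEB_EMBED_TOL = 1e-4
assert 0.7790 - 0.7801 < MTEB_EMBED_TOL  # vLLM slightly better: passes
assert not (0.7790 - 0.7780 < MTEB_EMBED_TOL)  # vLLM worse by 1e-3: would fail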

View File

@@ -7,37 +7,24 @@ from pathlib import Path
import mteb
import numpy as np
import requests
import torch
from mteb.models import ModelMeta
from mteb.types import Array
from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import (
EmbedModelInfo,
RerankModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs,
)
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["eng"]
MTEB_RERANK_TOL = 2e-3
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
_empty_model_meta = ModelMeta(
loader=None,
name="vllm/model",
@@ -60,84 +47,11 @@ _empty_model_meta = ModelMeta(
)
class VllmMtebEncoder(mteb.EncoderProtocol):
class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
def similarity(
self,
embeddings1: np.ndarray,
embeddings2: np.ndarray,
) -> np.ndarray:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
return sim
def similarity_pairwise(
self,
embeddings1: Array,
embeddings2: Array,
) -> Array:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.sum(embeddings1 * embeddings2, axis=1) / (
norm1.flatten() * norm2.flatten()
)
return sim
class OpenAIClientMtebEncoder(VllmMtebEncoder):
def __init__(self, model_name: str, client):
self.model_name = model_name
self.client = client
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
embeddings = self.client.embeddings.create(
model=self.model_name, input=sentences
)
outputs = [d.embedding for d in embeddings.data]
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
@@ -164,7 +78,7 @@ class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
return scores
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
mteb_model_meta = _empty_model_meta
def __init__(self, model_name: str, url):
@@ -216,102 +130,6 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
return response["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
tasks = mteb.get_tasks(tasks=tasks)
results = mteb.evaluate(
encoder,
tasks,
cache=None,
show_progress_bar=False,
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score
def mteb_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
# Test embed_dims, isnan and whether to use normalize
example_prompts = ["The chef prepared a delicious meal." * 1000]
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=model_info.max_model_len,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert model_info.architecture in model_config.architectures
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype
# Test embedding_size, isnan and whether to use normalize
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
outputs_tensor = torch.tensor(vllm_outputs)
assert not torch.any(torch.isnan(outputs_tensor))
embedding_size = model_config.embedding_size
assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
with hf_runner(
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
# e.g. setting default parameters for the encode method of hf_runner
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
# Check embeddings close to hf outputs
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
with tempfile.TemporaryDirectory() as prediction_folder:
bm25s = mteb.get_model("bm25s")
@@ -391,18 +209,28 @@ def mteb_test_rerank_models(
# Score API is only enabled for num_labels == 1
assert model_config.hf_config.num_labels == 1
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
# Maybe load chat_template.
chat_template: str | None = None
if model_info.chat_template_name is not None:
chat_template = (template_home / model_info.chat_template_name).read_text()
vllm_model.chat_template = chat_template
# Confirm whether the important configs in model_config are correct.
if model_info.pooling_type is not None:
assert model_config.pooler_config.pooling_type == model_info.pooling_type
if model_info.attn_type is not None:
assert model_config.attn_type == model_info.attn_type
if model_info.is_prefix_caching_supported is not None:
assert (
model_config.is_prefix_caching_supported
== model_info.is_prefix_caching_supported
)
if model_info.is_chunked_prefill_supported is not None:
assert (
model_config.is_chunked_prefill_supported
== model_info.is_chunked_prefill_supported
)
vllm_main_score = run_mteb_rerank(
vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,

View File

@@ -4,90 +4,94 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
########## Qwen2Model
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
]
RERANK_MODELS = [
########## XLMRobertaForSequenceClassification
CLSPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-large",
architecture="XLMRobertaForSequenceClassification",
enable_test=False,
),
CLSPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification",
enable_test=False,

View File

@@ -9,14 +9,12 @@ import torch
from torch.utils.data import DataLoader
from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import (
VllmMtebCrossEncoder,
mteb_test_rerank_models,
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from tests.models.utils import RerankModelInfo
from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
@@ -25,6 +23,10 @@ RERANK_MODELS = [
"classifier_from_token": ["Yes"],
"method": "no_post_processing",
},
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
),
]

View File

@@ -3,23 +3,29 @@
import pytest
from tests.models.utils import (
CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_rerank_models
from .mteb_score_utils import mteb_test_rerank_models
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
RerankModelInfo(
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
LASTPoolingRerankModelInfo(
RerankModelInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
),
]

View File

@@ -5,36 +5,32 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
),
########### NewModel
@@ -43,48 +39,64 @@ MODELS = [
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
pooling_type="LAST",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
########## ModernBertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM",
dtype="float32",
@@ -93,18 +105,26 @@ MODELS = [
]
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
RerankModelInfo(
# classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386,
architecture="ModernBertForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingRerankModelInfo(
RerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062,
architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
]

View File

@@ -3,40 +3,44 @@
import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-large", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel",
enable_test=False,

View File

@@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import (
matryoshka_fy,
)
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
RerankModelInfo,
)
from vllm import PoolingParams
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
dtype="float32",
)
]
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
RerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
)
]

View File

@@ -6,9 +6,9 @@ import pytest
import torch
from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from tests.models.utils import RerankModelInfo
from .mteb_utils import mteb_test_rerank_models
from .mteb_score_utils import mteb_test_rerank_models
mxbai_rerank_hf_overrides = {
"architectures": ["Qwen2ForSequenceClassification"],
@@ -17,14 +17,18 @@ }
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
LASTPoolingRerankModelInfo(
RerankModelInfo(
"mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,

View File

@@ -3,29 +3,39 @@
import pytest
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
mteb_test_embed_models,
)
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
mteb_test_rerank_models,
)
from tests.models.utils import (
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"nvidia/llama-nemotron-embed-1b-v2",
architecture="LlamaBidirectionalModel",
mteb_score=0.689164662128673,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
)
]
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"nvidia/llama-nemotron-rerank-1b-v2",
architecture="LlamaBidirectionalForSequenceClassification",
chat_template_name="nemotron-rerank.jinja",
mteb_score=0.33994,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
]

View File

@@ -4,30 +4,38 @@
import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
]

View File

@@ -6,10 +6,10 @@ import pytest
import torch
from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from tests.models.utils import RerankModelInfo
from tests.utils import multi_gpu_test
from .mteb_utils import mteb_test_rerank_models
from .mteb_score_utils import mteb_test_rerank_models
qwen3_reranker_hf_overrides = {
"architectures": ["Qwen3ForSequenceClassification"],
@@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
LASTPoolingRerankModelInfo(
RerankModelInfo(
"Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,

View File

@@ -4,62 +4,82 @@
import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
]

View File

@@ -3,25 +3,31 @@
import pytest
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
)
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
# ST models with projector (Dense) layers
ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"TencentBAC/Conan-embedding-v1",
architecture="BertModel",
mteb_score=0.688611955,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
dtype="float32",
),

View File

@@ -10,7 +10,7 @@ import torch
import torch.nn.functional as F
from transformers import PretrainedConfig
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
from vllm.multimodal.processing import InputProcessingContext
from vllm.tokenizers import cached_tokenizer_from_config
@@ -375,7 +375,10 @@ class ModelInfo:
max_model_len: int | None = None
hf_dtype: str = "float32"
hf_overrides: dict[str, Any] | None = None
default_pooling_type: str = ""
pooling_type: str | None = None
attn_type: AttnTypeStr | None = None
is_prefix_caching_supported: bool | None = None
is_chunked_prefill_supported: bool | None = None
enable_test: bool = True
@@ -386,32 +389,12 @@ class EmbedModelInfo(ModelInfo):
matryoshka_dimensions: list[int] | None = None
@dataclass
class CLSPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "LAST"
@dataclass
class RerankModelInfo(ModelInfo):
mteb_score: float | None = None
chat_template_name: str | None = None
@dataclass
class CLSPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "LAST"
@dataclass
class GenerateModelInfo(ModelInfo):
hf_dtype: str = "auto"