[CI] Reorganization pooling_mteb_test (#31265)

Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
wang.yuqi 2025-12-24 23:36:20 +08:00 committed by GitHub
parent 7cd288a4b3
commit 1ff67df182
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 480 additions and 347 deletions

View File

@ -4,7 +4,7 @@ import os
import pytest import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
MTEB_EMBED_TASKS, MTEB_EMBED_TASKS,
MTEB_EMBED_TOL, MTEB_EMBED_TOL,
OpenAIClientMtebEncoder, OpenAIClientMtebEncoder,

View File

@ -4,7 +4,7 @@ import os
import pytest import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.language.pooling_mteb_test.mteb_score_utils import (
MTEB_RERANK_LANGS, MTEB_RERANK_LANGS,
MTEB_RERANK_TASKS, MTEB_RERANK_TASKS,
MTEB_RERANK_TOL, MTEB_RERANK_TOL,

View File

@ -202,11 +202,10 @@ class TestGetScorePrompt:
tokenization_kwargs, tokenization_kwargs,
mock_model_no_score_template, mock_model_no_score_template,
): ):
# FIXME: Models implementing SupportsScoreTemplate must use their custom # FIXME: For now, we only apply a template when one is explicitly provided.
# template implementation by default to preserve existing functionality. # We cannot rely on the tokenizer's chat template because many models
# Attempting to use tokenizer_config.json templates would most likely break # inherit junk templates from their base LLM, which breaks both the models
# these models, as often they just inherit the template from the original LLM. # and the tests that use them.
# CLI --chat-template overrides are still supported.
with ( with (
patch( patch(
"vllm.model_executor.model_loader.get_model_cls", "vllm.model_executor.model_loader.get_model_cls",

View File

@ -0,0 +1,228 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import mteb
import numpy as np
import torch
from mteb.models import ModelMeta
from mteb.types import Array
from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import (
EmbedModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs,
)
# Observed for most embedding models on the STS12 task (see #17175):
# - Differences in model implementation and minor changes in tensor dtype
#   result in score differences of less than 1e-4.
# - A different model results in score differences of more than 1e-3.
# Hence 1e-4 is a good tolerance threshold.
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4
# Placeholder model metadata: only a dummy name, a dummy revision and the
# text modality are filled in; every other descriptive field is left empty.
# It is attached to MtebEmbedMixin as its ``mteb_model_meta`` attribute.
# NOTE(review): presumably mteb expects encoders to expose such metadata --
# confirm against the mteb EncoderProtocol documentation.
_empty_model_meta = ModelMeta(
    loader=None,
    name="vllm/model",
    revision="1",
    release_date=None,
    languages=None,
    framework=[],
    similarity_fn_name=None,
    n_parameters=None,
    memory_usage_mb=None,
    max_tokens=None,
    embed_dim=None,
    license=None,
    open_weights=None,
    public_training_code=None,
    public_training_data=None,
    use_instructions=None,
    training_datasets=None,
    modalities=["text"],  # 'image' can be added to evaluate multimodal models
)
class MtebEmbedMixin(mteb.EncoderProtocol):
    """Cosine-similarity helpers shared by the MTEB embedding encoders.

    The mixin carries the placeholder model metadata and the two similarity
    functions; the encoders built on top of it provide ``encode``.
    """

    mteb_model_meta = _empty_model_meta

    def similarity(
        self,
        embeddings1: np.ndarray,
        embeddings2: np.ndarray,
    ) -> np.ndarray:
        """Return the cosine similarity of every row pair.

        Entry ``[i, j]`` of the result compares ``embeddings1[i]`` with
        ``embeddings2[j]``.
        """
        lengths1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
        lengths2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
        dot_products = np.dot(embeddings1, embeddings2.T)
        # Column of norms times row of norms broadcasts to the full grid of
        # norm products, one per (i, j) pair.
        norm_products = lengths1 * lengths2.T
        return dot_products / norm_products

    def similarity_pairwise(
        self,
        embeddings1: Array,
        embeddings2: Array,
    ) -> Array:
        """Return the cosine similarity of matching rows.

        Entry ``[i]`` of the result compares ``embeddings1[i]`` with
        ``embeddings2[i]``.
        """
        lengths1 = np.linalg.norm(embeddings1, axis=1, keepdims=True).flatten()
        lengths2 = np.linalg.norm(embeddings2, axis=1, keepdims=True).flatten()
        row_dots = np.sum(embeddings1 * embeddings2, axis=1)
        return row_dots / (lengths1 * lengths2)
class VllmMtebEncoder(MtebEmbedMixin):
    """MTEB encoder that embeds text through a vLLM model wrapper."""

    def __init__(self, vllm_model):
        self.llm = vllm_model
        # Fixed seed, so the shuffle sequence is the same for every instance.
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        inputs: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed every text in ``inputs`` and return the rows in input order.

        The texts are submitted in a random order, hoping to discover
        potential scheduling issues; the original order is restored before
        returning. Extra positional and keyword arguments are accepted but
        not used.
        """
        texts = []
        for batch in inputs:
            texts.extend(batch["text"])

        shuffle_order = self.rng.permutation(len(texts))
        shuffled_texts = [texts[idx] for idx in shuffle_order]

        shuffled_embeds = np.array(self.llm.embed(shuffled_texts, use_tqdm=False))
        # argsort of a permutation is its inverse, which undoes the shuffle.
        restore_order = np.argsort(shuffle_order)
        return shuffled_embeds[restore_order]
class OpenAIClientMtebEncoder(MtebEmbedMixin):
    """MTEB encoder that embeds text through ``client.embeddings.create``."""

    def __init__(self, model_name: str, client):
        self.model_name = model_name
        self.client = client
        # Fixed seed, so the shuffle sequence is the same for every instance.
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        inputs: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed every text in ``inputs`` and return the rows in input order.

        The texts are sent in a random order, hoping to discover potential
        scheduling issues; the original order is restored before returning.
        Extra positional and keyword arguments are accepted but not used.
        """
        texts = []
        for batch in inputs:
            texts.extend(batch["text"])

        shuffle_order = self.rng.permutation(len(texts))
        shuffled_texts = [texts[idx] for idx in shuffle_order]

        response = self.client.embeddings.create(
            model=self.model_name, input=shuffled_texts
        )
        shuffled_embeds = np.array([item.embedding for item in response.data])
        # argsort of a permutation is its inverse, which undoes the shuffle.
        restore_order = np.argsort(shuffle_order)
        return shuffled_embeds[restore_order]
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
    """Evaluate ``encoder`` on ``tasks`` and return the first main score.

    Args:
        encoder: Encoder handed to ``mteb.evaluate``.
        tasks: Task names resolved through ``mteb.get_tasks``.

    Returns:
        The ``main_score`` of the first ``"test"`` entry of the first
        evaluation result.
    """
    resolved_tasks = mteb.get_tasks(tasks=tasks)
    evaluation = mteb.evaluate(
        encoder,
        resolved_tasks,
        cache=None,
        show_progress_bar=False,
    )
    first_result = evaluation[0]
    return first_result.scores["test"][0]["main_score"]
def mteb_test_embed_models(
    hf_runner,
    vllm_runner,
    model_info: EmbedModelInfo,
    vllm_extra_kwargs=None,
    hf_model_callback=None,
    atol=MTEB_EMBED_TOL,
):
    """Compare vLLM's MTEB embedding score against a reference score.

    The model is loaded through ``vllm_runner``, selected ``model_config``
    fields are checked against ``model_info``, and the model is scored on
    ``MTEB_EMBED_TASKS``. The reference score is ``model_info.mteb_score``
    when set; otherwise the model is loaded through ``hf_runner`` as a
    SentenceTransformer, scored the same way, and its embedding of a long
    prompt is additionally compared with vLLM's.

    Args:
        hf_runner: Context-manager factory for the reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Model name plus the expected configuration and,
            optionally, a precomputed reference score.
        vllm_extra_kwargs: Extra keyword arguments, merged by
            ``get_vllm_extra_kwargs`` and forwarded to ``vllm_runner``.
        hf_model_callback: Optional hook called with the reference model
            before it is evaluated.
        atol: Largest amount by which the vLLM score may fall below the
            reference score.

    Raises:
        AssertionError: If a configuration check fails, the vLLM embedding
            contains NaN or has the wrong size, or the vLLM score trails
            the reference score by ``atol`` or more.
    """
    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

    # Test embed_dims, isnan and whether to use normalize
    example_prompts = ["The chef prepared a delicious meal." * 1000]

    with vllm_runner(
        model_info.name,
        runner="pooling",
        max_model_len=model_info.max_model_len,
        **vllm_extra_kwargs,
    ) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config

        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures

        # Confirm whether the important configs in model_config are correct.
        # An expectation left as None in model_info is not checked.
        if model_info.pooling_type is not None:
            assert model_config.pooler_config.pooling_type == model_info.pooling_type
        for config_field in (
            "attn_type",
            "is_prefix_caching_supported",
            "is_chunked_prefill_supported",
        ):
            expected = getattr(model_info, config_field)
            if expected is not None:
                assert getattr(model_config, config_field) == expected

        vllm_main_score = run_mteb_embed_task(
            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
        )
        vllm_dtype = model_config.dtype
        head_dtype = model_config.head_dtype

        # Test embedding_size, isnan and whether to use normalize
        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
        outputs_tensor = torch.tensor(vllm_outputs)
        assert not torch.any(torch.isnan(outputs_tensor))
        assert outputs_tensor.shape[-1] == model_config.embedding_size

    # NOTE(review): block nesting was reconstructed from syntax; the reference
    # run is placed after the vLLM context has exited -- confirm.
    if model_info.mteb_score is not None:
        # Accelerate mteb test by setting
        # SentenceTransformers mteb score to a constant
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"
    else:
        with hf_runner(
            model_info.name,
            is_sentence_transformer=True,
            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
        ) as hf_model:
            # e.g. setting default parameters for the encode method of hf_runner
            if hf_model_callback is not None:
                hf_model_callback(hf_model)

            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
            st_dtype = next(hf_model.model.parameters()).dtype

            # Check embeddings close to hf outputs
            hf_outputs = hf_model.encode(example_prompts)
            check_embeddings_close(
                embeddings_0_lst=hf_outputs,
                embeddings_1_lst=vllm_outputs,
                name_0="hf",
                name_1="vllm",
                tol=1e-2,
            )

    score_gap = st_main_score - vllm_main_score
    print("Model:", model_info.name)
    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", score_gap)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert score_gap < atol

View File

@ -7,37 +7,24 @@ from pathlib import Path
import mteb import mteb
import numpy as np import numpy as np
import requests import requests
import torch
from mteb.models import ModelMeta from mteb.models import ModelMeta
from mteb.types import Array
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import ( from tests.models.utils import (
EmbedModelInfo,
RerankModelInfo, RerankModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs, get_vllm_extra_kwargs,
) )
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4
# See #19344 # See #19344
MTEB_RERANK_TASKS = ["NFCorpus"] MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["eng"] MTEB_RERANK_LANGS = ["eng"]
MTEB_RERANK_TOL = 2e-3 MTEB_RERANK_TOL = 2e-3
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
_empty_model_meta = ModelMeta( _empty_model_meta = ModelMeta(
loader=None, loader=None,
name="vllm/model", name="vllm/model",
@ -60,84 +47,11 @@ _empty_model_meta = ModelMeta(
) )
class VllmMtebEncoder(mteb.EncoderProtocol): class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
def similarity(
self,
embeddings1: np.ndarray,
embeddings2: np.ndarray,
) -> np.ndarray:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
return sim
def similarity_pairwise(
self,
embeddings1: Array,
embeddings2: Array,
) -> Array:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.sum(embeddings1 * embeddings2, axis=1) / (
norm1.flatten() * norm2.flatten()
)
return sim
class OpenAIClientMtebEncoder(VllmMtebEncoder):
def __init__(self, model_name: str, client):
self.model_name = model_name
self.client = client
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
embeddings = self.client.embeddings.create(
model=self.model_name, input=sentences
)
outputs = [d.embedding for d in embeddings.data]
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
def __init__(self, vllm_model): def __init__(self, vllm_model):
self.llm = vllm_model self.llm = vllm_model
self.rng = np.random.default_rng(seed=42) self.rng = np.random.default_rng(seed=42)
@ -164,7 +78,7 @@ class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
return scores return scores
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol): class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
mteb_model_meta = _empty_model_meta mteb_model_meta = _empty_model_meta
def __init__(self, model_name: str, url): def __init__(self, model_name: str, url):
@ -216,102 +130,6 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
return response["results"][0]["relevance_score"] return response["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
tasks = mteb.get_tasks(tasks=tasks)
results = mteb.evaluate(
encoder,
tasks,
cache=None,
show_progress_bar=False,
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score
def mteb_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
# Test embed_dims, isnan and whether to use normalize
example_prompts = ["The chef prepared a delicious meal." * 1000]
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=model_info.max_model_len,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert model_info.architecture in model_config.architectures
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype
# Test embedding_size, isnan and whether to use normalize
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
outputs_tensor = torch.tensor(vllm_outputs)
assert not torch.any(torch.isnan(outputs_tensor))
embedding_size = model_config.embedding_size
assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
with hf_runner(
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
# e.g. setting default parameters for the encode method of hf_runner
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
# Check embeddings close to hf outputs
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
with tempfile.TemporaryDirectory() as prediction_folder: with tempfile.TemporaryDirectory() as prediction_folder:
bm25s = mteb.get_model("bm25s") bm25s = mteb.get_model("bm25s")
@ -391,18 +209,28 @@ def mteb_test_rerank_models(
# Score API is only enabled for num_labels == 1 # Score API is only enabled for num_labels == 1
assert model_config.hf_config.num_labels == 1 assert model_config.hf_config.num_labels == 1
# Confirm whether vllm uses the correct default_pooling_type, which # Maybe load chat_template.
# relates to whether chunked prefill and prefix caching are enabled
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
chat_template: str | None = None chat_template: str | None = None
if model_info.chat_template_name is not None: if model_info.chat_template_name is not None:
chat_template = (template_home / model_info.chat_template_name).read_text() chat_template = (template_home / model_info.chat_template_name).read_text()
vllm_model.chat_template = chat_template vllm_model.chat_template = chat_template
# Confirm whether the important configs in model_config are correct.
if model_info.pooling_type is not None:
assert model_config.pooler_config.pooling_type == model_info.pooling_type
if model_info.attn_type is not None:
assert model_config.attn_type == model_info.attn_type
if model_info.is_prefix_caching_supported is not None:
assert (
model_config.is_prefix_caching_supported
== model_info.is_prefix_caching_supported
)
if model_info.is_chunked_prefill_supported is not None:
assert (
model_config.is_chunked_prefill_supported
== model_info.is_chunked_prefill_supported
)
vllm_main_score = run_mteb_rerank( vllm_main_score = run_mteb_rerank(
vllm_mteb_encoder(vllm_model), vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS, tasks=MTEB_RERANK_TASKS,

View File

@ -4,90 +4,94 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [ MODELS = [
########## BertModel ########## BertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-base-en", "BAAI/bge-base-en",
architecture="BertModel", architecture="BertModel",
mteb_score=0.779336792, mteb_score=0.779336792,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
), EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
CLSPoolingEmbedModelInfo( EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
"BAAI/bge-small-en", architecture="BertModel", enable_test=False EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
), EmbedModelInfo(
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
), ),
########## XLMRobertaModel ########## XLMRobertaModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-m3", "BAAI/bge-m3",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
mteb_score=0.787343078, mteb_score=0.787343078,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
########## Qwen2Model ########## Qwen2Model
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-code-v1", "BAAI/bge-code-v1",
architecture="Qwen2Model", architecture="Qwen2Model",
mteb_score=0.75724465, mteb_score=0.75724465,
dtype="float32", dtype="float32",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
] ]
RERANK_MODELS = [ RERANK_MODELS = [
########## XLMRobertaForSequenceClassification ########## XLMRobertaForSequenceClassification
CLSPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-base", "BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398, mteb_score=0.32398,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-large", "BAAI/bge-reranker-large",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
enable_test=False, enable_test=False,
), ),
CLSPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-v2-m3", "BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
enable_test=False, enable_test=False,

View File

@ -9,14 +9,12 @@ import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.utils import RerankModelInfo
VllmMtebCrossEncoder,
mteb_test_rerank_models, from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-v2-gemma", "BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification", architecture="GemmaForSequenceClassification",
mteb_score=0.33757, mteb_score=0.33757,
@ -25,6 +23,10 @@ RERANK_MODELS = [
"classifier_from_token": ["Yes"], "classifier_from_token": ["Yes"],
"method": "no_post_processing", "method": "no_post_processing",
}, },
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
), ),
] ]

View File

@ -3,23 +3,29 @@
import pytest import pytest
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_rerank_models from .mteb_score_utils import mteb_test_rerank_models
RERANK_MODELS = [ RERANK_MODELS = [
CLSPoolingRerankModelInfo( RerankModelInfo(
"cross-encoder/ms-marco-TinyBERT-L-2-v2", "cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898, mteb_score=0.32898,
architecture="BertForSequenceClassification", architecture="BertForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
LASTPoolingRerankModelInfo( RerankModelInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736, mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
), ),
] ]

View File

@ -5,36 +5,32 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [ MODELS = [
########## BertModel ########## BertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"thenlper/gte-large", "thenlper/gte-large",
mteb_score=0.76807651, mteb_score=0.76807651,
architecture="BertModel", architecture="BertModel",
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
"thenlper/gte-base", architecture="BertModel", enable_test=False EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
), EmbedModelInfo(
CLSPoolingEmbedModelInfo(
"thenlper/gte-small", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False "thenlper/gte-large-zh", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False EmbedModelInfo(
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False "thenlper/gte-small-zh", architecture="BertModel", enable_test=False
), ),
########### NewModel ########### NewModel
@ -43,48 +39,64 @@ MODELS = [
# - whether to use token_type_embeddings # - whether to use token_type_embeddings
# - whether to use context expansion # - whether to use context expansion
# So only test one (the most widely used) model # So only test one (the most widely used) model
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-multilingual-base", "Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel", architecture="GteNewModel",
mteb_score=0.775074696, mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]}, hf_overrides={"architectures": ["GteNewModel"]},
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-base-en-v1.5", "Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel", architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]}, hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-large-en-v1.5", "Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel", architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]}, hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False, enable_test=False,
), ),
########### Qwen2ForCausalLM ########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872, mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM", architecture="Qwen2ForCausalLM",
pooling_type="LAST",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
########## ModernBertModel ########## ModernBertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-modernbert-base", "Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353, mteb_score=0.748193353,
architecture="ModernBertModel", architecture="ModernBertModel",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
########## Qwen3ForCausalLM ########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"Qwen/Qwen3-Embedding-0.6B", "Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695, mteb_score=0.771163695,
architecture="Qwen3ForCausalLM", architecture="Qwen3ForCausalLM",
dtype="float32", dtype="float32",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"Qwen/Qwen3-Embedding-4B", "Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM", architecture="Qwen3ForCausalLM",
dtype="float32", dtype="float32",
@ -93,18 +105,26 @@ MODELS = [
] ]
RERANK_MODELS = [ RERANK_MODELS = [
CLSPoolingRerankModelInfo( RerankModelInfo(
# classifier_pooling: mean # classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base", "Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386, mteb_score=0.33386,
architecture="ModernBertForSequenceClassification", architecture="ModernBertForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingRerankModelInfo( RerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base", "Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062, mteb_score=0.33062,
architecture="GteNewForSequenceClassification", architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
] ]

View File

@ -3,40 +3,44 @@
import pytest import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
MODELS = [ MODELS = [
########## BertModel ########## BertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/e5-small", "intfloat/e5-small",
architecture="BertModel", architecture="BertModel",
mteb_score=0.742285423, mteb_score=0.742285423,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
"intfloat/e5-base", architecture="BertModel", enable_test=False EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
), EmbedModelInfo(
CLSPoolingEmbedModelInfo(
"intfloat/e5-large", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
), ),
########## XLMRobertaModel ########## XLMRobertaModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/multilingual-e5-base", "intfloat/multilingual-e5-base",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
mteb_score=0.779325955, mteb_score=0.779325955,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/multilingual-e5-large", "intfloat/multilingual-e5-large",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/multilingual-e5-large-instruct", "intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
enable_test=False, enable_test=False,

View File

@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import (
matryoshka_fy, matryoshka_fy,
) )
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo, EmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from vllm import PoolingParams from vllm import PoolingParams
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
EMBEDDING_MODELS = [ EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"jinaai/jina-embeddings-v3", "jinaai/jina-embeddings-v3",
mteb_score=0.824413164, mteb_score=0.824413164,
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
is_matryoshka=True, is_matryoshka=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
dtype="float32", dtype="float32",
) )
] ]
RERANK_MODELS = [ RERANK_MODELS = [
CLSPoolingRerankModelInfo( RerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual", "jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643, mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
) )
] ]

View File

@ -6,9 +6,9 @@ import pytest
import torch import torch
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo from tests.models.utils import RerankModelInfo
from .mteb_utils import mteb_test_rerank_models from .mteb_score_utils import mteb_test_rerank_models
mxbai_rerank_hf_overrides = { mxbai_rerank_hf_overrides = {
"architectures": ["Qwen2ForSequenceClassification"], "architectures": ["Qwen2ForSequenceClassification"],
@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = {
} }
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"mixedbread-ai/mxbai-rerank-base-v2", "mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification", architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides, hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273, mteb_score=0.273,
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
LASTPoolingRerankModelInfo( RerankModelInfo(
"mixedbread-ai/mxbai-rerank-large-v2", "mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification", architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides, hf_overrides=mxbai_rerank_hf_overrides,

View File

@ -3,29 +3,39 @@
import pytest import pytest
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
mteb_test_embed_models,
)
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
mteb_test_rerank_models,
)
from tests.models.utils import ( from tests.models.utils import (
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [ EMBEDDING_MODELS = [
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"nvidia/llama-nemotron-embed-1b-v2", "nvidia/llama-nemotron-embed-1b-v2",
architecture="LlamaBidirectionalModel", architecture="LlamaBidirectionalModel",
mteb_score=0.689164662128673, mteb_score=0.689164662128673,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
) )
] ]
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"nvidia/llama-nemotron-rerank-1b-v2", "nvidia/llama-nemotron-rerank-1b-v2",
architecture="LlamaBidirectionalForSequenceClassification", architecture="LlamaBidirectionalForSequenceClassification",
chat_template_name="nemotron-rerank.jinja", chat_template_name="nemotron-rerank.jinja",
mteb_score=0.33994, mteb_score=0.33994,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
] ]

View File

@ -4,30 +4,38 @@
import pytest import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
MODELS = [ MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1", "nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel", architecture="NomicBertModel",
mteb_score=0.737568559, mteb_score=0.737568559,
enable_test=True, enable_test=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1.5", "nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel", architecture="NomicBertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/nomic-embed-text-v2-moe", "nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel", architecture="NomicBertModel",
mteb_score=0.715488912, mteb_score=0.715488912,
enable_test=True, enable_test=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
] ]

View File

@ -6,10 +6,10 @@ import pytest
import torch import torch
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo from tests.models.utils import RerankModelInfo
from tests.utils import multi_gpu_test from tests.utils import multi_gpu_test
from .mteb_utils import mteb_test_rerank_models from .mteb_score_utils import mteb_test_rerank_models
qwen3_reranker_hf_overrides = { qwen3_reranker_hf_overrides = {
"architectures": ["Qwen3ForSequenceClassification"], "architectures": ["Qwen3ForSequenceClassification"],
@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = {
} }
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B", "Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736, mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides, hf_overrides=qwen3_reranker_hf_overrides,
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
LASTPoolingRerankModelInfo( RerankModelInfo(
"Qwen/Qwen3-Reranker-4B", "Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides, hf_overrides=qwen3_reranker_hf_overrides,

View File

@ -4,62 +4,82 @@
import pytest import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
MODELS = [ MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-xs", "Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
mteb_score=0.714927797, mteb_score=0.714927797,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-s", "Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m", "Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-long", "Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False, is_matryoshka=False,
architecture="NomicBertModel", architecture="NomicBertModel",
mteb_score=0.681146831, mteb_score=0.681146831,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l", "Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5", "Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True, is_matryoshka=True,
architecture="BertModel", architecture="BertModel",
mteb_score=0.649088363, mteb_score=0.649088363,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l-v2.0", "Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True, is_matryoshka=True,
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
mteb_score=0.712258299, mteb_score=0.712258299,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v2.0", "Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True, is_matryoshka=True,
architecture="GteModel", architecture="GteModel",
mteb_score=0.706622444, mteb_score=0.706622444,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
] ]

View File

@ -3,25 +3,31 @@
import pytest import pytest
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
) )
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
# ST models with projector (Dense) layers # ST models with projector (Dense) layers
ST_PROJECTOR_MODELS = [ ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"TencentBAC/Conan-embedding-v1", "TencentBAC/Conan-embedding-v1",
architecture="BertModel", architecture="BertModel",
mteb_score=0.688611955, mteb_score=0.688611955,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"google/embeddinggemma-300m", "google/embeddinggemma-300m",
architecture="Gemma3TextModel", architecture="Gemma3TextModel",
mteb_score=0.7473819294684156, mteb_score=0.7473819294684156,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
dtype="float32", dtype="float32",
), ),

View File

@ -10,7 +10,7 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config.model import ModelConfig, ModelDType, RunnerOption from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
from vllm.multimodal.processing import InputProcessingContext from vllm.multimodal.processing import InputProcessingContext
from vllm.tokenizers import cached_tokenizer_from_config from vllm.tokenizers import cached_tokenizer_from_config
@ -375,7 +375,10 @@ class ModelInfo:
max_model_len: int | None = None max_model_len: int | None = None
hf_dtype: str = "float32" hf_dtype: str = "float32"
hf_overrides: dict[str, Any] | None = None hf_overrides: dict[str, Any] | None = None
default_pooling_type: str = "" pooling_type: str | None = None
attn_type: AttnTypeStr | None = None
is_prefix_caching_supported: bool | None = None
is_chunked_prefill_supported: bool | None = None
enable_test: bool = True enable_test: bool = True
@ -386,32 +389,12 @@ class EmbedModelInfo(ModelInfo):
matryoshka_dimensions: list[int] | None = None matryoshka_dimensions: list[int] | None = None
@dataclass
class CLSPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "LAST"
@dataclass @dataclass
class RerankModelInfo(ModelInfo): class RerankModelInfo(ModelInfo):
mteb_score: float | None = None mteb_score: float | None = None
chat_template_name: str | None = None chat_template_name: str | None = None
@dataclass
class CLSPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "LAST"
@dataclass @dataclass
class GenerateModelInfo(ModelInfo): class GenerateModelInfo(ModelInfo):
hf_dtype: str = "auto" hf_dtype: str = "auto"