diff --git a/tests/entrypoints/pooling/embed/test_correctness_mteb.py b/tests/entrypoints/pooling/embed/test_correctness_mteb.py
index 8cdd3d3c858d5..4c8d9f0d82a24 100644
--- a/tests/entrypoints/pooling/embed/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/embed/test_correctness_mteb.py
@@ -4,7 +4,7 @@
 import os
 
 import pytest
-from tests.models.language.pooling_mteb_test.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
     MTEB_EMBED_TASKS,
     MTEB_EMBED_TOL,
     OpenAIClientMtebEncoder,
diff --git a/tests/entrypoints/pooling/score/test_correctness_mteb.py b/tests/entrypoints/pooling/score/test_correctness_mteb.py
index 71e75b93504ac..1ee45b44596fa 100644
--- a/tests/entrypoints/pooling/score/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/score/test_correctness_mteb.py
@@ -4,7 +4,7 @@
 import os
 
 import pytest
-from tests.models.language.pooling_mteb_test.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_score_utils import (
     MTEB_RERANK_LANGS,
     MTEB_RERANK_TASKS,
     MTEB_RERANK_TOL,
diff --git a/tests/entrypoints/pooling/score/test_utils.py b/tests/entrypoints/pooling/score/test_utils.py
index 92b443c48825f..356fd0ad6678f 100644
--- a/tests/entrypoints/pooling/score/test_utils.py
+++ b/tests/entrypoints/pooling/score/test_utils.py
@@ -202,11 +202,10 @@ class TestGetScorePrompt:
         tokenization_kwargs,
         mock_model_no_score_template,
     ):
-        # FIXME: Models implementing SupportsScoreTemplate must use their custom
-        # template implementation by default to preserve existing functionality.
-        # Attempting to use tokenizer_config.json templates would most likely break
-        # these models, as often they just inherit the template from the original LLM.
-        # CLI --chat-template overrides are still supported.
+        # FIXME: For now, we only apply a template when one is explicitly provided.
+        # We cannot rely on the tokenizer's chat template because many models
+        # inherit junk templates from their base LLM, which breaks both the models
+        # and the tests that use them.
         with (
             patch(
                 "vllm.model_executor.model_loader.get_model_cls",
diff --git a/tests/models/language/pooling_mteb_test/mteb_embed_utils.py b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
new file mode 100644
index 0000000000000..a0b469f930644
--- /dev/null
+++ b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
@@ -0,0 +1,228 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import mteb
+import numpy as np
+import torch
+from mteb.models import ModelMeta
+from mteb.types import Array
+from torch.utils.data import DataLoader
+
+import tests.ci_envs as ci_envs
+from tests.models.utils import (
+    EmbedModelInfo,
+    check_embeddings_close,
+    get_vllm_extra_kwargs,
+)
+
+# Most embedding models on the STS12 task (See #17175):
+# - Model implementation and minor changes in tensor dtype
+# result in differences less than 1e-4
+# - Different models result in differences more than 1e-3
+# 1e-4 is a good tolerance threshold
+MTEB_EMBED_TASKS = ["STS12"]
+MTEB_EMBED_TOL = 1e-4
+
+
+_empty_model_meta = ModelMeta(
+    loader=None,
+    name="vllm/model",
+    revision="1",
+    release_date=None,
+    languages=None,
+    framework=[],
+    similarity_fn_name=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    max_tokens=None,
+    embed_dim=None,
+    license=None,
+    open_weights=None,
+    public_training_code=None,
+    public_training_data=None,
+    use_instructions=None,
+    training_datasets=None,
+    modalities=["text"],  # 'image' can be added to evaluate multimodal models
+)
+
+
+class MtebEmbedMixin(mteb.EncoderProtocol):
+    mteb_model_meta = _empty_model_meta
+
+    def similarity(
+        self,
+        embeddings1: np.ndarray,
+        embeddings2: np.ndarray,
+    ) -> np.ndarray:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
+        return sim
+
+    def similarity_pairwise(
+        self,
+        embeddings1: Array,
+        embeddings2: Array,
+    ) -> Array:
+        # Cosine similarity
+        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+        sim = np.sum(embeddings1 * embeddings2, axis=1) / (
+            norm1.flatten() * norm2.flatten()
+        )
+        return sim
+
+
+class VllmMtebEncoder(MtebEmbedMixin):
+    def __init__(self, vllm_model):
+        self.llm = vllm_model
+        self.rng = np.random.default_rng(seed=42)
+
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
+        r = self.rng.permutation(len(sentences))
+        sentences = [sentences[i] for i in r]
+        outputs = self.llm.embed(sentences, use_tqdm=False)
+        embeds = np.array(outputs)
+        embeds = embeds[np.argsort(r)]
+        return embeds
+
+
+class OpenAIClientMtebEncoder(MtebEmbedMixin):
+    def __init__(self, model_name: str, client):
+        self.model_name = model_name
+        self.client = client
+        self.rng = np.random.default_rng(seed=42)
+
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
+        r = self.rng.permutation(len(sentences))
+        sentences = [sentences[i] for i in r]
+
+        embeddings = self.client.embeddings.create(
+            model=self.model_name, input=sentences
+        )
+        outputs = [d.embedding for d in embeddings.data]
+        embeds = np.array(outputs)
+        embeds = embeds[np.argsort(r)]
+        return embeds
+
+
+def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
+    tasks = mteb.get_tasks(tasks=tasks)
+    results = mteb.evaluate(
+        encoder,
+        tasks,
+        cache=None,
+        show_progress_bar=False,
+    )
+
+    main_score = results[0].scores["test"][0]["main_score"]
+    return main_score
+
+
+def mteb_test_embed_models(
+    hf_runner,
+    vllm_runner,
+    model_info: EmbedModelInfo,
+    vllm_extra_kwargs=None,
+    hf_model_callback=None,
+    atol=MTEB_EMBED_TOL,
+):
+    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
+
+    # Test embed_dims, isnan and whether to use normalize
+    example_prompts = ["The chef prepared a delicious meal." * 1000]
+
+    with vllm_runner(
+        model_info.name,
+        runner="pooling",
+        max_model_len=model_info.max_model_len,
+        **vllm_extra_kwargs,
+    ) as vllm_model:
+        model_config = vllm_model.llm.llm_engine.model_config
+
+        # Confirm whether vllm is using the correct architecture
+        if model_info.architecture:
+            assert model_info.architecture in model_config.architectures
+
+        # Confirm whether the important configs in model_config are correct.
+        if model_info.pooling_type is not None:
+            assert model_config.pooler_config.pooling_type == model_info.pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )
+
+        vllm_main_score = run_mteb_embed_task(
+            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
+        )
+        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
+        head_dtype = model_config.head_dtype
+
+        # Test embedding_size, isnan and whether to use normalize
+        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
+        outputs_tensor = torch.tensor(vllm_outputs)
+        assert not torch.any(torch.isnan(outputs_tensor))
+        embedding_size = model_config.embedding_size
+        assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
+
+    # Accelerate mteb test by setting
+    # SentenceTransformers mteb score to a constant
+    if model_info.mteb_score is None:
+        with hf_runner(
+            model_info.name,
+            is_sentence_transformer=True,
+            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
+        ) as hf_model:
+            # e.g. setting default parameters for the encode method of hf_runner
+            if hf_model_callback is not None:
+                hf_model_callback(hf_model)
+
+            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+            st_dtype = next(hf_model.model.parameters()).dtype
+
+            # Check embeddings close to hf outputs
+            hf_outputs = hf_model.encode(example_prompts)
+            check_embeddings_close(
+                embeddings_0_lst=hf_outputs,
+                embeddings_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+                tol=1e-2,
+            )
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
+
+    print("Model:", model_info.name)
+    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
+    print("SentenceTransformers:", st_dtype, st_main_score)
+    print("Difference:", st_main_score - vllm_main_score)
+
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol
diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_score_utils.py
similarity index 51%
rename from tests/models/language/pooling_mteb_test/mteb_utils.py
rename to tests/models/language/pooling_mteb_test/mteb_score_utils.py
index 11e05a635c1d1..6c13502317736 100644
--- a/tests/models/language/pooling_mteb_test/mteb_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py
@@ -7,37 +7,24 @@ from pathlib import Path
 
 import mteb
 import numpy as np
 import requests
-import torch
 from mteb.models import ModelMeta
-from mteb.types import Array
 from torch.utils.data import DataLoader
 
-import tests.ci_envs as ci_envs
 from tests.models.utils import (
-    EmbedModelInfo,
     RerankModelInfo,
-    check_embeddings_close,
     get_vllm_extra_kwargs,
 )
 
-template_home = (
-    Path(__file__).parent.parent.parent.parent.parent
-    / "examples/pooling/score/template"
-)
-
-# Most embedding models on the STS12 task (See #17175):
-# - Model implementation and minor changes in tensor dtype
-# results in differences less than 1e-4
-# - Different model results in differences more than 1e-3
-# 1e-4 is a good tolerance threshold
-MTEB_EMBED_TASKS = ["STS12"]
-MTEB_EMBED_TOL = 1e-4
-
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
 MTEB_RERANK_LANGS = ["eng"]
 MTEB_RERANK_TOL = 2e-3
 
+template_home = (
+    Path(__file__).parent.parent.parent.parent.parent
+    / "examples/pooling/score/template"
+)
+
 _empty_model_meta = ModelMeta(
     loader=None,
     name="vllm/model",
@@ -60,84 +47,11 @@ _empty_model_meta = ModelMeta(
 )
 
 
-class VllmMtebEncoder(mteb.EncoderProtocol):
+class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
     mteb_model_meta = _empty_model_meta
 
-    def __init__(self, vllm_model):
-        self.llm = vllm_model
-        self.rng = np.random.default_rng(seed=42)
-
-    def encode(
-        self,
-        inputs: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        # Hoping to discover potential scheduling
-        # issues by randomizing the order.
-        sentences = [text for batch in inputs for text in batch["text"]]
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-        outputs = self.llm.embed(sentences, use_tqdm=False)
-        embeds = np.array(outputs)
-        embeds = embeds[np.argsort(r)]
-        return embeds
-
-    def similarity(
-        self,
-        embeddings1: np.ndarray,
-        embeddings2: np.ndarray,
-    ) -> np.ndarray:
-        # Cosine similarity
-        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
-        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
-        sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
-        return sim
-
-    def similarity_pairwise(
-        self,
-        embeddings1: Array,
-        embeddings2: Array,
-    ) -> Array:
-        # Cosine similarity
-        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
-        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
-        sim = np.sum(embeddings1 * embeddings2, axis=1) / (
-            norm1.flatten() * norm2.flatten()
-        )
-        return sim
-
-
-class OpenAIClientMtebEncoder(VllmMtebEncoder):
-    def __init__(self, model_name: str, client):
-        self.model_name = model_name
-        self.client = client
-        self.rng = np.random.default_rng(seed=42)
-
-    def encode(
-        self,
-        inputs: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        # Hoping to discover potential scheduling
-        # issues by randomizing the order.
-        sentences = [text for batch in inputs for text in batch["text"]]
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-
-        embeddings = self.client.embeddings.create(
-            model=self.model_name, input=sentences
-        )
-        outputs = [d.embedding for d in embeddings.data]
-        embeds = np.array(outputs)
-        embeds = embeds[np.argsort(r)]
-        return embeds
-
-
-class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
-    mteb_model_meta = _empty_model_meta
 
+class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
     def __init__(self, vllm_model):
         self.llm = vllm_model
         self.rng = np.random.default_rng(seed=42)
@@ -164,7 +78,7 @@ class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
         return scores
 
 
-class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
+class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
     mteb_model_meta = _empty_model_meta
 
     def __init__(self, model_name: str, url):
@@ -216,102 +130,6 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
         return response["results"][0]["relevance_score"]
 
 
-def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
-    tasks = mteb.get_tasks(tasks=tasks)
-    results = mteb.evaluate(
-        encoder,
-        tasks,
-        cache=None,
-        show_progress_bar=False,
-    )
-
-    main_score = results[0].scores["test"][0]["main_score"]
-    return main_score
-
-
-def mteb_test_embed_models(
-    hf_runner,
-    vllm_runner,
-    model_info: EmbedModelInfo,
-    vllm_extra_kwargs=None,
-    hf_model_callback=None,
-    atol=MTEB_EMBED_TOL,
-):
-    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
-
-    # Test embed_dims, isnan and whether to use normalize
-    example_prompts = ["The chef prepared a delicious meal." * 1000]
-
-    with vllm_runner(
-        model_info.name,
-        runner="pooling",
-        max_model_len=model_info.max_model_len,
-        **vllm_extra_kwargs,
-    ) as vllm_model:
-        model_config = vllm_model.llm.llm_engine.model_config
-
-        # Confirm whether vllm is using the correct architecture
-        if model_info.architecture:
-            assert model_info.architecture in model_config.architectures
-
-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
-
-        vllm_main_score = run_mteb_embed_task(
-            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
-        )
-        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
-        head_dtype = model_config.head_dtype
-
-        # Test embedding_size, isnan and whether to use normalize
-        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
-        outputs_tensor = torch.tensor(vllm_outputs)
-        assert not torch.any(torch.isnan(outputs_tensor))
-        embedding_size = model_config.embedding_size
-        assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
-
-    # Accelerate mteb test by setting
-    # SentenceTransformers mteb score to a constant
-    if model_info.mteb_score is None:
-        with hf_runner(
-            model_info.name,
-            is_sentence_transformer=True,
-            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
-        ) as hf_model:
-            # e.g. setting default parameters for the encode method of hf_runner
-            if hf_model_callback is not None:
-                hf_model_callback(hf_model)
-
-            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
-            st_dtype = next(hf_model.model.parameters()).dtype
-
-            # Check embeddings close to hf outputs
-            hf_outputs = hf_model.encode(example_prompts)
-            check_embeddings_close(
-                embeddings_0_lst=hf_outputs,
-                embeddings_1_lst=vllm_outputs,
-                name_0="hf",
-                name_1="vllm",
-                tol=1e-2,
-            )
-    else:
-        st_main_score = model_info.mteb_score
-        st_dtype = "Constant"
-
-    print("Model:", model_info.name)
-    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
-    print("SentenceTransformers:", st_dtype, st_main_score)
-    print("Difference:", st_main_score - vllm_main_score)
-
-    # We are not concerned that the vllm mteb results are better
-    # than SentenceTransformers, so we only perform one-sided testing.
-    assert st_main_score - vllm_main_score < atol
-
-
 def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
     with tempfile.TemporaryDirectory() as prediction_folder:
         bm25s = mteb.get_model("bm25s")
@@ -391,18 +209,28 @@ def mteb_test_rerank_models(
         # Score API is only enabled for num_labels == 1
         assert model_config.hf_config.num_labels == 1
 
-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
-
+        # Maybe load chat_template.
         chat_template: str | None = None
         if model_info.chat_template_name is not None:
             chat_template = (template_home / model_info.chat_template_name).read_text()
             vllm_model.chat_template = chat_template
 
+        # Confirm whether the important configs in model_config are correct.
+        if model_info.pooling_type is not None:
+            assert model_config.pooler_config.pooling_type == model_info.pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )
+
         vllm_main_score = run_mteb_rerank(
             vllm_mteb_encoder(vllm_model),
             tasks=MTEB_RERANK_TASKS,
diff --git a/tests/models/language/pooling_mteb_test/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py
index bad13e2457146..2e55622a5d48c 100644
--- a/tests/models/language/pooling_mteb_test/test_baai.py
+++ b/tests/models/language/pooling_mteb_test/test_baai.py
@@ -4,90 +4,94 @@
 import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
     EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
     RerankModelInfo,
 )
 
-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-base-en",
         architecture="BertModel",
         mteb_score=0.779336792,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-base-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-small-en", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-small-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-large-en", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-large-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
         "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
     ),
     ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-m3",
         architecture="XLMRobertaModel",
         mteb_score=0.787343078,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
     ########## Qwen2Model
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "BAAI/bge-code-v1",
         architecture="Qwen2Model",
         mteb_score=0.75724465,
         dtype="float32",
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
         enable_test=True,
     ),
 ]
 
 RERANK_MODELS = [
     ########## XLMRobertaForSequenceClassification
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "BAAI/bge-reranker-base",
         architecture="XLMRobertaForSequenceClassification",
         mteb_score=0.32398,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "BAAI/bge-reranker-large",
         architecture="XLMRobertaForSequenceClassification",
         enable_test=False,
     ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "BAAI/bge-reranker-v2-m3",
         architecture="XLMRobertaForSequenceClassification",
         enable_test=False,
diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
index 6b2e469644926..00f2d33546efc 100644
--- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
@@ -9,14 +9,12 @@ import torch
 from torch.utils.data import DataLoader
 
 from tests.conftest import HfRunner
-from tests.models.language.pooling_mteb_test.mteb_utils import (
-    VllmMtebCrossEncoder,
-    mteb_test_rerank_models,
-)
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo
+
+from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "BAAI/bge-reranker-v2-gemma",
         architecture="GemmaForSequenceClassification",
         mteb_score=0.33757,
@@ -25,6 +23,10 @@ RERANK_MODELS = [
             "classifier_from_token": ["Yes"],
             "method": "no_post_processing",
         },
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
     ),
 ]
diff --git a/tests/models/language/pooling_mteb_test/test_cross_encoder.py b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
index 638ffc7a62b0e..8bca49bb5b023 100644
--- a/tests/models/language/pooling_mteb_test/test_cross_encoder.py
+++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
@@ -3,23 +3,29 @@
 
 import pytest
 
 from tests.models.utils import (
-    CLSPoolingRerankModelInfo,
-    LASTPoolingRerankModelInfo,
     RerankModelInfo,
 )
 
-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "cross-encoder/ms-marco-TinyBERT-L-2-v2",
         mteb_score=0.32898,
         architecture="BertForSequenceClassification",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
     ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
         mteb_score=0.25736,
         architecture="Qwen3ForSequenceClassification",
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
     ),
 ]
diff --git a/tests/models/language/pooling_mteb_test/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py
index a22821fd65b5a..3d1d5aa84091e 100644
--- a/tests/models/language/pooling_mteb_test/test_gte.py
+++ b/tests/models/language/pooling_mteb_test/test_gte.py
@@ -5,36 +5,32 @@ import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
     EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
     RerankModelInfo,
 )
 
-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "thenlper/gte-large",
         mteb_score=0.76807651,
         architecture="BertModel",
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-base", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-small", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
         "thenlper/gte-large-zh", architecture="BertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-base-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
         "thenlper/gte-small-zh", architecture="BertModel", enable_test=False
     ),
     ########### NewModel
@@ -43,48 +39,64 @@ MODELS = [
     # - whether to use token_type_embeddings
     # - whether to use context expansion
     # So only test one (the most widely used) model
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-multilingual-base",
         architecture="GteNewModel",
         mteb_score=0.775074696,
         hf_overrides={"architectures": ["GteNewModel"]},
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-base-en-v1.5",
         architecture="GteNewModel",
         hf_overrides={"architectures": ["GteNewModel"]},
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-large-en-v1.5",
         architecture="GteNewModel",
         hf_overrides={"architectures": ["GteNewModel"]},
         enable_test=False,
     ),
     ########### Qwen2ForCausalLM
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         mteb_score=0.758473459018872,
         architecture="Qwen2ForCausalLM",
+        pooling_type="LAST",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
     ########## ModernBertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Alibaba-NLP/gte-modernbert-base",
         mteb_score=0.748193353,
         architecture="ModernBertModel",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
     ########## Qwen3ForCausalLM
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Qwen/Qwen3-Embedding-0.6B",
         mteb_score=0.771163695,
         architecture="Qwen3ForCausalLM",
         dtype="float32",
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
         enable_test=True,
     ),
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Qwen/Qwen3-Embedding-4B",
         architecture="Qwen3ForCausalLM",
         dtype="float32",
@@ -93,18 +105,26 @@ MODELS = [
 ]
 
 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         # classifier_pooling: mean
         "Alibaba-NLP/gte-reranker-modernbert-base",
         mteb_score=0.33386,
         architecture="ModernBertForSequenceClassification",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "Alibaba-NLP/gte-multilingual-reranker-base",
         mteb_score=0.33062,
         architecture="GteNewForSequenceClassification",
         hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
 ]
diff --git a/tests/models/language/pooling_mteb_test/test_intfloat.py b/tests/models/language/pooling_mteb_test/test_intfloat.py
index 1d078db69236a..377ab600aa443 100644
--- a/tests/models/language/pooling_mteb_test/test_intfloat.py
+++ b/tests/models/language/pooling_mteb_test/test_intfloat.py
@@ -3,40 +3,44 @@
 
 import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo
 
-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "intfloat/e5-small",
         architecture="BertModel",
         mteb_score=0.742285423,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
-        "intfloat/e5-base", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "intfloat/e5-large", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
         "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
     ),
     ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "intfloat/multilingual-e5-base",
         architecture="XLMRobertaModel",
         mteb_score=0.779325955,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "intfloat/multilingual-e5-large",
         architecture="XLMRobertaModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "intfloat/multilingual-e5-large-instruct",
         architecture="XLMRobertaModel",
         enable_test=False,
diff --git a/tests/models/language/pooling_mteb_test/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py
index c2065bcd6eb4c..b98ac91b97573 100644
--- a/tests/models/language/pooling_mteb_test/test_jina.py
+++ b/tests/models/language/pooling_mteb_test/test_jina.py
@@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import (
     matryoshka_fy,
 )
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
     EmbedModelInfo,
     RerankModelInfo,
 )
 from vllm import PoolingParams
 
-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 EMBEDDING_MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "jinaai/jina-embeddings-v3",
         mteb_score=0.824413164,
         architecture="XLMRobertaModel",
         is_matryoshka=True,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         dtype="float32",
     )
 ]
 
 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
         "jinaai/jina-reranker-v2-base-multilingual",
         mteb_score=0.33643,
         architecture="XLMRobertaForSequenceClassification",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
     )
 ]
diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
index a6f2a89b268f1..50dc6a0bd0ad1 100644
--- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
+++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
@@ -6,9 +6,9 @@ import pytest
 import torch
 
 from tests.conftest import HfRunner
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo
 
-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 mxbai_rerank_hf_overrides = {
     "architectures": ["Qwen2ForSequenceClassification"],
@@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = {
 }
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "mixedbread-ai/mxbai-rerank-base-v2",
         architecture="Qwen2ForSequenceClassification",
         hf_overrides=mxbai_rerank_hf_overrides,
         mteb_score=0.273,
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
         enable_test=True,
     ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "mixedbread-ai/mxbai-rerank-large-v2",
         architecture="Qwen2ForSequenceClassification",
         hf_overrides=mxbai_rerank_hf_overrides,
diff --git a/tests/models/language/pooling_mteb_test/test_nemotron.py b/tests/models/language/pooling_mteb_test/test_nemotron.py
index 167c3fcf50d1c..c91616c9ec01e 100644
--- a/tests/models/language/pooling_mteb_test/test_nemotron.py
+++ b/tests/models/language/pooling_mteb_test/test_nemotron.py
@@ -3,29 +3,39 @@
 
 import pytest
 
+from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
+    mteb_test_embed_models,
+)
+from tests.models.language.pooling_mteb_test.mteb_score_utils import (
+    mteb_test_rerank_models,
+)
 from tests.models.utils import (
     EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
-    LASTPoolingRerankModelInfo,
     RerankModelInfo,
 )
 
-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
-
 EMBEDDING_MODELS = [
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "nvidia/llama-nemotron-embed-1b-v2",
         architecture="LlamaBidirectionalModel",
         mteb_score=0.689164662128673,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    )
 ]
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "nvidia/llama-nemotron-rerank-1b-v2",
         architecture="LlamaBidirectionalForSequenceClassification",
         chat_template_name="nemotron-rerank.jinja",
         mteb_score=0.33994,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
     ),
 ]
diff --git a/tests/models/language/pooling_mteb_test/test_nomic.py b/tests/models/language/pooling_mteb_test/test_nomic.py
index c54a43052483a..06c568026a75a 100644
--- a/tests/models/language/pooling_mteb_test/test_nomic.py
+++ b/tests/models/language/pooling_mteb_test/test_nomic.py
@@ -4,30 +4,38 @@
 import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo
 
-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models
 
 MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "nomic-ai/nomic-embed-text-v1",
         architecture="NomicBertModel",
         mteb_score=0.737568559,
         enable_test=True,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "nomic-ai/nomic-embed-text-v1.5",
         architecture="NomicBertModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "nomic-ai/nomic-embed-text-v2-moe",
         architecture="NomicBertModel",
         mteb_score=0.715488912,
         enable_test=True,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
     ),
 ]
diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
index 9a1be6c0be1d6..a8e79c8391072 100644
--- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
+++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
@@ -6,10 +6,10 @@ import pytest
 import torch
 
 from tests.conftest import HfRunner
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo
 from tests.utils import multi_gpu_test
 
-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models
 
 qwen3_reranker_hf_overrides = {
     "architectures": ["Qwen3ForSequenceClassification"],
@@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = {
 }
 
 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "Qwen/Qwen3-Reranker-0.6B",
         architecture="Qwen3ForSequenceClassification",
         mteb_score=0.25736,
         hf_overrides=qwen3_reranker_hf_overrides,
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
         enable_test=True,
     ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
         "Qwen/Qwen3-Reranker-4B",
         architecture="Qwen3ForSequenceClassification",
         hf_overrides=qwen3_reranker_hf_overrides,
diff --git a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
index 3c30628aeaa49..37597a7e9ebab 100644
--- a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
@@ -4,62 +4,82 @@
 import pytest
 
 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo
 
-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models
 
 MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-xs",
         is_matryoshka=False,
         architecture="BertModel",
         mteb_score=0.714927797,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-s",
         is_matryoshka=False,
         architecture="BertModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-m",
         is_matryoshka=False,
         architecture="BertModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-m-long",
         is_matryoshka=False,
         architecture="NomicBertModel",
         mteb_score=0.681146831,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-l",
         is_matryoshka=False,
         architecture="BertModel",
         enable_test=False,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-m-v1.5",
         is_matryoshka=True,
         architecture="BertModel",
         mteb_score=0.649088363,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-l-v2.0",
         is_matryoshka=True,
         architecture="XLMRobertaModel",
         mteb_score=0.712258299,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "Snowflake/snowflake-arctic-embed-m-v2.0",
         is_matryoshka=True,
         architecture="GteModel",
         mteb_score=0.706622444,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
 ]
diff --git a/tests/models/language/pooling_mteb_test/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py
index 74fe4b9bcc03f..c1fd61b8e2270 100644
--- a/tests/models/language/pooling_mteb_test/test_st_projector.py
+++ b/tests/models/language/pooling_mteb_test/test_st_projector.py
@@ -3,25 +3,31 @@
 
 import pytest
 
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
     EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
 )
 
-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models
 
 # ST models with projector (Dense) layers
 ST_PROJECTOR_MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "TencentBAC/Conan-embedding-v1",
         architecture="BertModel",
         mteb_score=0.688611955,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
     ),
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
         "google/embeddinggemma-300m",
         architecture="Gemma3TextModel",
         mteb_score=0.7473819294684156,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
         enable_test=True,
         dtype="float32",
     ),
diff --git a/tests/models/utils.py b/tests/models/utils.py
index bf26c21fb5f58..12544bc96bb5a 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -10,7 +10,7 @@ import torch
 import torch.nn.functional as F
 from transformers import PretrainedConfig
 
-from vllm.config.model import ModelConfig, ModelDType, RunnerOption
+from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
 from vllm.tokenizers import cached_tokenizer_from_config
@@ -375,7 +375,10 @@ class ModelInfo:
     max_model_len: int | None = None
     hf_dtype: str = "float32"
     hf_overrides: dict[str, Any] | None = None
-    default_pooling_type: str = ""
+    pooling_type: str | None = None
+    attn_type: AttnTypeStr | None = None
+    is_prefix_caching_supported: bool | None = None
+    is_chunked_prefill_supported: bool | None = None
     enable_test: bool = True
 
 
@@ -386,32 +389,12 @@ class EmbedModelInfo(ModelInfo):
     matryoshka_dimensions: list[int] | None = None
 
 
-@dataclass
-class CLSPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "LAST"
-
-
 @dataclass
 class RerankModelInfo(ModelInfo):
     mteb_score: float | None = None
     chat_template_name: str | None = None
 
 
-@dataclass
-class CLSPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "LAST"
-
-
 @dataclass
 class GenerateModelInfo(ModelInfo):
     hf_dtype: str = "auto"
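
The heart of the refactor is the tests/models/utils.py change above: the CLSPooling*/LASTPooling* ModelInfo subclasses are replaced by four optional fields, and the shared MTEB helpers assert an expectation only when the test declares it. A minimal, self-contained sketch of that pattern follows; the simplified dataclass, check_model_config helper, and fake_config stand-in below are illustrative only, not part of the patch:

# Sketch only -- simplified from the patch; vLLM's real ModelConfig is richer.
from dataclasses import dataclass
from types import SimpleNamespace


@dataclass
class EmbedModelInfo:
    name: str
    architecture: str = ""
    mteb_score: float | None = None
    # Fields replacing the CLSPooling*/LASTPooling* subclasses.
    # None means "do not check" in the shared test helpers.
    pooling_type: str | None = None  # e.g. "CLS", "LAST", "MEAN"
    attn_type: str | None = None  # e.g. "encoder_only", "decoder"
    is_prefix_caching_supported: bool | None = None
    is_chunked_prefill_supported: bool | None = None


def check_model_config(model_config, model_info: EmbedModelInfo) -> None:
    # Mirrors the per-field checks that mteb_test_embed_models and
    # mteb_test_rerank_models now perform against the resolved config.
    if model_info.pooling_type is not None:
        assert model_config.pooler_config.pooling_type == model_info.pooling_type
    if model_info.attn_type is not None:
        assert model_config.attn_type == model_info.attn_type
    if model_info.is_prefix_caching_supported is not None:
        assert (
            model_config.is_prefix_caching_supported
            == model_info.is_prefix_caching_supported
        )
    if model_info.is_chunked_prefill_supported is not None:
        assert (
            model_config.is_chunked_prefill_supported
            == model_info.is_chunked_prefill_supported
        )


info = EmbedModelInfo(
    "BAAI/bge-base-en",
    architecture="BertModel",
    mteb_score=0.779336792,
    pooling_type="CLS",
    attn_type="encoder_only",
    is_prefix_caching_supported=False,
    is_chunked_prefill_supported=False,
)

# Stand-in for the model_config that vLLM resolves at load time.
fake_config = SimpleNamespace(
    pooler_config=SimpleNamespace(pooling_type="CLS"),
    attn_type="encoder_only",
    is_prefix_caching_supported=False,
    is_chunked_prefill_supported=False,
)
check_model_config(fake_config, info)  # all declared expectations hold

Using None as "not checked" lets legacy entries such as BAAI/bge-base-zh omit the new fields entirely, while flagship entries pin down pooling, attention type, and scheduling support explicitly instead of encoding them in a subclass name.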