# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from tests.models.language.pooling.embed_utils import (
    correctness_test_embed_models)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
                                CLSPoolingRerankModelInfo, EmbedModelInfo,
                                LASTPoolingEmbedModelInfo, RerankModelInfo)

from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models

MODELS = [
    ########## BertModel
    CLSPoolingEmbedModelInfo("thenlper/gte-large",
                             mteb_score=0.76807651,
                             architecture="BertModel",
                             enable_test=True),
    CLSPoolingEmbedModelInfo("thenlper/gte-base",
                             architecture="BertModel",
                             enable_test=False),
    CLSPoolingEmbedModelInfo("thenlper/gte-small",
                             architecture="BertModel",
                             enable_test=False),
    CLSPoolingEmbedModelInfo("thenlper/gte-large-zh",
                             architecture="BertModel",
                             enable_test=False),
    CLSPoolingEmbedModelInfo("thenlper/gte-base-zh",
                             architecture="BertModel",
                             enable_test=False),
    CLSPoolingEmbedModelInfo("thenlper/gte-small-zh",
                             architecture="BertModel",
                             enable_test=False),
    ########## NewModel
    # These three checkpoints share almost the same architecture, differing
    # only in details such as:
    # - whether token_type_embeddings are used
    # - whether context expansion is used
    # So only the most widely used model is tested.
    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
                             architecture="GteNewModel",
                             mteb_score=0.775074696,
                             hf_overrides={"architectures": ["GteNewModel"]},
                             enable_test=True),
    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
                             architecture="GteNewModel",
                             hf_overrides={"architectures": ["GteNewModel"]},
                             enable_test=False),
    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
                             architecture="GteNewModel",
                             hf_overrides={"architectures": ["GteNewModel"]},
                             enable_test=False),
    ########## Qwen2ForCausalLM
    LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                              mteb_score=0.758473459018872,
                              architecture="Qwen2ForCausalLM",
                              enable_test=True),
    ########## ModernBertModel
    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
                             mteb_score=0.748193353,
                             architecture="ModernBertModel",
                             enable_test=True),
    ########## Qwen3ForCausalLM
    LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
                              mteb_score=0.771163695,
                              architecture="Qwen3ForCausalLM",
                              dtype="float32",
                              enable_test=True),
    LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B",
                              architecture="Qwen3ForCausalLM",
                              dtype="float32",
                              enable_test=False),
]

RERANK_MODELS = [
    CLSPoolingRerankModelInfo(
        # classifier_pooling: mean
        "Alibaba-NLP/gte-reranker-modernbert-base",
        mteb_score=0.33386,
        architecture="ModernBertForSequenceClassification",
        enable_test=True),
    CLSPoolingRerankModelInfo(
        "Alibaba-NLP/gte-multilingual-reranker-base",
        mteb_score=0.33062,
        architecture="GteNewForSequenceClassification",
        hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
        enable_test=True),
]


@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
                           model_info: EmbedModelInfo) -> None:
    mteb_test_embed_models(hf_runner, vllm_runner, model_info)


@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
                                  model_info: EmbedModelInfo,
                                  example_prompts) -> None:
    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
                                  example_prompts)


@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
                            model_info: RerankModelInfo) -> None:
    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
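
# --- Illustrative usage (not part of the test suite) ---
# A minimal sketch of embedding text with one of the models listed above via
# vLLM's offline API. The model name comes from the MODELS table; the exact
# `LLM` constructor arguments and output shape are assumptions about the
# current vLLM release, so treat this as a sketch rather than a reference:
#
#     from vllm import LLM
#
#     llm = LLM(model="thenlper/gte-large", task="embed")
#     outputs = llm.embed(["What is the capital of France?"])
#     print(len(outputs[0].outputs.embedding))  # embedding dimension
#
# Only entries with enable_test=True are exercised against live checkpoints;
# the enable_test=False entries document supported variants that the shared
# test helpers skip to keep CI time down.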