From fd1ce98cdd1f4699c781d2f24b39f7b16a338886 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 11 Sep 2025 21:37:51 +0800 Subject: [PATCH] [CI] Split mteb test from Language Models Test (#24634) Signed-off-by: wang.yuqi --- .buildkite/test-pipeline.yaml | 10 ++++++++++ .../pooling/correctness/test_mteb_embed.py | 7 +++---- .../pooling/correctness/test_mteb_score.py | 12 +++--------- tests/models/language/pooling_mteb_test/__init__.py | 0 .../{pooling => pooling_mteb_test}/mteb_utils.py | 0 .../{pooling => pooling_mteb_test}/test_baai.py | 10 ++++++---- .../test_bge_reranker_v2_gemma.py | 6 +++--- .../test_cross_encoder.py | 5 +++-- .../{pooling => pooling_mteb_test}/test_gte.py | 10 ++++++---- .../{pooling => pooling_mteb_test}/test_intfloat.py | 6 ++++-- .../{pooling => pooling_mteb_test}/test_jina.py | 9 +++++---- .../test_mxbai_rerank.py | 2 +- .../{pooling => pooling_mteb_test}/test_nomic.py | 6 ++++-- .../test_qwen3_reranker.py | 2 +- .../test_snowflake_arctic_embed.py | 6 ++++-- .../test_st_projector.py | 5 +++-- 16 files changed, 56 insertions(+), 40 deletions(-) create mode 100644 tests/models/language/pooling_mteb_test/__init__.py rename tests/models/language/{pooling => pooling_mteb_test}/mteb_utils.py (100%) rename tests/models/language/{pooling => pooling_mteb_test}/test_baai.py (93%) rename tests/models/language/{pooling => pooling_mteb_test}/test_bge_reranker_v2_gemma.py (96%) rename tests/models/language/{pooling => pooling_mteb_test}/test_cross_encoder.py (85%) rename tests/models/language/{pooling => pooling_mteb_test}/test_gte.py (94%) rename tests/models/language/{pooling => pooling_mteb_test}/test_intfloat.py (92%) rename tests/models/language/{pooling => pooling_mteb_test}/test_jina.py (92%) rename tests/models/language/{pooling => pooling_mteb_test}/test_mxbai_rerank.py (97%) rename tests/models/language/{pooling => pooling_mteb_test}/test_nomic.py (90%) rename tests/models/language/{pooling => pooling_mteb_test}/test_qwen3_reranker.py (98%) rename tests/models/language/{pooling => pooling_mteb_test}/test_snowflake_arctic_embed.py (94%) rename tests/models/language/{pooling => pooling_mteb_test}/test_st_projector.py (86%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3f6b67e45de1..adbd08b13f75 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -642,6 +642,16 @@ steps: commands: - pytest -v -s models/language/pooling -m 'not core_model' +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + - label: Multi-Modal Processor Test # 44min timeout_in_minutes: 60 source_file_dependencies: diff --git a/tests/entrypoints/pooling/correctness/test_mteb_embed.py b/tests/entrypoints/pooling/correctness/test_mteb_embed.py index 1601c18d9b78..12a4875bdacf 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_embed.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_embed.py @@ -4,10 +4,9 @@ import os import pytest -from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS, - MTEB_EMBED_TOL, - OpenAIClientMtebEncoder, - run_mteb_embed_task) +from tests.models.language.pooling_mteb_test.mteb_utils import ( + MTEB_EMBED_TASKS, MTEB_EMBED_TOL, OpenAIClientMtebEncoder, + run_mteb_embed_task) from tests.utils import RemoteOpenAIServer os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" diff --git a/tests/entrypoints/pooling/correctness/test_mteb_score.py b/tests/entrypoints/pooling/correctness/test_mteb_score.py index 417f85adc6e0..7c059d16b386 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_score.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_score.py @@ -4,15 +4,9 @@ import os import pytest -# yapf conflicts with isort for this block -# yapf: disable -from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS, - MTEB_RERANK_TASKS, - MTEB_RERANK_TOL, - RerankClientMtebEncoder, - ScoreClientMtebEncoder, - run_mteb_rerank) -# yapf: enable +from tests.models.language.pooling_mteb_test.mteb_utils import ( + MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL, + RerankClientMtebEncoder, ScoreClientMtebEncoder, run_mteb_rerank) from tests.utils import RemoteOpenAIServer os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" diff --git a/tests/models/language/pooling_mteb_test/__init__.py b/tests/models/language/pooling_mteb_test/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py similarity index 100% rename from tests/models/language/pooling/mteb_utils.py rename to tests/models/language/pooling_mteb_test/mteb_utils.py diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py similarity index 93% rename from tests/models/language/pooling/test_baai.py rename to tests/models/language/pooling_mteb_test/test_baai.py index be8cb6fa7699..e131c9b1038d 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling_mteb_test/test_baai.py @@ -2,10 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, - EmbedModelInfo, LASTPoolingEmbedModelInfo, - RerankModelInfo) -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import (CLSPoolingEmbedModelInfo, + CLSPoolingRerankModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo, RerankModelInfo) + from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py similarity index 96% rename from tests/models/language/pooling/test_bge_reranker_v2_gemma.py rename to tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index fc888157b402..1eca2a2c0abd 100644 --- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -7,9 +7,9 @@ import pytest import torch from tests.conftest import HfRunner - -from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo -from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models +from tests.models.language.pooling_mteb_test.mteb_utils import ( + VllmMtebEncoder, mteb_test_rerank_models) +from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo RERANK_MODELS = [ LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma", diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling_mteb_test/test_cross_encoder.py similarity index 85% rename from tests/models/language/pooling/test_cross_encoder.py rename to tests/models/language/pooling_mteb_test/test_cross_encoder.py index b49908c9ce6a..ad320fae0c85 100644 --- a/tests/models/language/pooling/test_cross_encoder.py +++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py @@ -2,8 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import (CLSPoolingRerankModelInfo, LASTPoolingRerankModelInfo, - RerankModelInfo) +from tests.models.utils import (CLSPoolingRerankModelInfo, + LASTPoolingRerankModelInfo, RerankModelInfo) + from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py similarity index 94% rename from tests/models/language/pooling/test_gte.py rename to tests/models/language/pooling_mteb_test/test_gte.py index 98d215b0ad25..9ae43fd05bf7 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling_mteb_test/test_gte.py @@ -3,10 +3,12 @@ import pytest -from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, - EmbedModelInfo, LASTPoolingEmbedModelInfo, - RerankModelInfo) -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import (CLSPoolingEmbedModelInfo, + CLSPoolingRerankModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo, RerankModelInfo) + from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling_mteb_test/test_intfloat.py similarity index 92% rename from tests/models/language/pooling/test_intfloat.py rename to tests/models/language/pooling_mteb_test/test_intfloat.py index bc95475836e8..0d6026898ad4 100644 --- a/tests/models/language/pooling/test_intfloat.py +++ b/tests/models/language/pooling_mteb_test/test_intfloat.py @@ -2,8 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo + from .mteb_utils import mteb_test_embed_models MODELS = [ diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py similarity index 92% rename from tests/models/language/pooling/test_jina.py rename to tests/models/language/pooling_mteb_test/test_jina.py index c4e4835556a5..0a77a78bb31b 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling_mteb_test/test_jina.py @@ -4,12 +4,13 @@ from functools import partial import pytest +from tests.models.language.pooling.embed_utils import ( + check_embeddings_close, correctness_test_embed_models, matryoshka_fy) +from tests.models.utils import (CLSPoolingEmbedModelInfo, + CLSPoolingRerankModelInfo, EmbedModelInfo, + RerankModelInfo) from vllm import PoolingParams -from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, - EmbedModelInfo, RerankModelInfo) -from .embed_utils import (check_embeddings_close, - correctness_test_embed_models, matryoshka_fy) from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models EMBEDDING_MODELS = [ diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py similarity index 97% rename from tests/models/language/pooling/test_mxbai_rerank.py rename to tests/models/language/pooling_mteb_test/test_mxbai_rerank.py index 1731c6ae6fff..05ebb4ec4d3f 100644 --- a/tests/models/language/pooling/test_mxbai_rerank.py +++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py @@ -6,8 +6,8 @@ import pytest import torch from tests.conftest import HfRunner +from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo -from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo from .mteb_utils import mteb_test_rerank_models mxbai_rerank_hf_overrides = { diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling_mteb_test/test_nomic.py similarity index 90% rename from tests/models/language/pooling/test_nomic.py rename to tests/models/language/pooling_mteb_test/test_nomic.py index 52a8ce6e6671..61512fd0dff1 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling_mteb_test/test_nomic.py @@ -3,8 +3,10 @@ import pytest -from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo + from .mteb_utils import mteb_test_embed_models MODELS = [ diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py similarity index 98% rename from tests/models/language/pooling/test_qwen3_reranker.py rename to tests/models/language/pooling_mteb_test/test_qwen3_reranker.py index ebdacf9d0c67..65403081dc0f 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py @@ -6,9 +6,9 @@ import pytest import torch from tests.conftest import HfRunner +from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo from tests.utils import multi_gpu_test -from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo from .mteb_utils import mteb_test_rerank_models qwen3_reranker_hf_overrides = { diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py similarity index 94% rename from tests/models/language/pooling/test_snowflake_arctic_embed.py rename to tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py index 864f3d75ef5a..91bad2c4e42f 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py @@ -3,8 +3,10 @@ import pytest -from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo + from .mteb_utils import mteb_test_embed_models MODELS = [ diff --git a/tests/models/language/pooling/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py similarity index 86% rename from tests/models/language/pooling/test_st_projector.py rename to tests/models/language/pooling_mteb_test/test_st_projector.py index 9301e705c433..bd493e7e2ba0 100644 --- a/tests/models/language/pooling/test_st_projector.py +++ b/tests/models/language/pooling_mteb_test/test_st_projector.py @@ -2,8 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo) +from tests.models.utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo) + from .mteb_utils import mteb_test_embed_models # ST models with projector (Dense) layers