diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b633625a77e30..adb5c862eecd9 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -571,36 +571,85 @@ steps:
 
 ##### models test #####
 
-- label: Basic Models Test # 57min
-  timeout_in_minutes: 75
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
-  - tests/models
+  - tests/models/test_initialization.py
   commands:
-  - pytest -v -s models/test_transformers.py
-  - pytest -v -s models/test_registry.py
-  - pytest -v -s models/test_utils.py
-  - pytest -v -s models/test_vision.py
-  - pytest -v -s models/test_initialization.py
+  # Run a subset of model initialization tests
+  - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
 
-- label: Language Models Test (Standard) # 35min
+- label: Basic Models Tests (Extra Initialization) %N
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/test_initialization.py
+  commands:
+  # Only when vLLM model source is modified - test initialization of a large
+  # subset of supported models (the complement of the small subset in the above
+  # test.) Also run if model initialization test file is modified
+  - pytest -v -s models/test_initialization.py \
+    -k 'not test_can_initialize_small_subset' \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  commands:
+  - pytest -v -s models/test_transformers.py \
+    models/test_registry.py \
+    models/test_utils.py \
+    models/test_vision.py
+
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
   - vllm/
   - tests/models/language
   commands:
+  # Test standard language models, excluding a subset of slow tests
   - pip freeze | grep -E 'torch'
-  - pytest -v -s models/language -m core_model
+  - pytest -v -s models/language -m 'core_model and (not slow_test)'
 
-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+  # Shard slow subset of standard language models tests. Only run when model
+  # source is modified, or when specified test files are modified
+  - pip freeze | grep -E 'torch'
+  - pytest -v -s models/language -m 'core_model and slow_test' \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
   - vllm/
   - tests/models/language/generation
   commands:
@@ -608,7 +657,12 @@ steps:
   # Note: also needed to run plamo2 model in vLLM
   - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
   - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-  - pytest -v -s models/language/generation -m hybrid_model
+  # Shard hybrid language model tests
+  - pytest -v -s models/language/generation \
+    -m hybrid_model \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
 
 - label: Language Models Test (Extended Generation) # 80min
   timeout_in_minutes: 110
diff --git a/pyproject.toml b/pyproject.toml
index 416423abcad88..f5a44f56f416e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -145,6 +145,7 @@ skip_gitignore = true
 
 [tool.pytest.ini_options]
 markers = [
+    "slow_test",
     "skip_global_cleanup",
     "core_model: enable this model test in each PR instead of only nightly",
     "hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index a39f24c80f1c5..a5aa1e3f49743 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -38,7 +38,7 @@ AITER_MODEL_LIST = [
     [
         pytest.param(
             "bigscience/bloom-560m",  # bloom - testing alibi slopes
-            marks=[pytest.mark.core_model],
+            marks=[pytest.mark.core_model, pytest.mark.slow_test],
         ),
         pytest.param(
             "openai-community/gpt2",  # gpt2
@@ -49,7 +49,10 @@ AITER_MODEL_LIST = [
         pytest.param("EleutherAI/pythia-70m"),  # gpt_neox
         pytest.param(
             "google/gemma-1.1-2b-it",  # gemma
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[
+                pytest.mark.core_model, pytest.mark.cpu_model,
+                pytest.mark.slow_test
+            ],
         ),
         pytest.param(
             "zai-org/chatglm3-6b",  # chatglm (text-only)
@@ -70,14 +73,17 @@ AITER_MODEL_LIST = [
         ),
         pytest.param(
             "microsoft/phi-2",  # phi
-            marks=[pytest.mark.core_model],
+            marks=[pytest.mark.core_model, pytest.mark.slow_test],
         ),
         pytest.param(
             "Qwen/Qwen-7B-Chat",  # qwen (text-only)
         ),
         pytest.param(
             "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[
+                pytest.mark.core_model, pytest.mark.cpu_model,
+                pytest.mark.slow_test
+            ],
         ),
         pytest.param(
             "Qwen/Qwen3-8B",  # qwen (text-only)
diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py
index c71fa96275335..8e398830d39df 100644
--- a/tests/models/language/pooling/test_classification.py
+++ b/tests/models/language/pooling/test_classification.py
@@ -11,7 +11,10 @@ from vllm.platforms import current_platform
     "model",
     [
         pytest.param("jason9693/Qwen2.5-1.5B-apeach",
-                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+                     marks=[
+                         pytest.mark.core_model, pytest.mark.cpu_model,
+                         pytest.mark.slow_test
+                     ]),
     ],
 )
 @pytest.mark.parametrize("dtype",
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index 41574b844a668..d61ac08475e3c 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -19,7 +19,7 @@ from ...utils import check_embeddings_close
         # model code with bidirectional attention.
         # [Decoder-only]
         pytest.param("BAAI/bge-multilingual-gemma2",
-                     marks=[pytest.mark.core_model]),
+                     marks=[pytest.mark.core_model, pytest.mark.slow_test]),
         pytest.param(
             "intfloat/e5-mistral-7b-instruct",
             # CPU v1 doesn't support sliding window
@@ -29,7 +29,10 @@ from ...utils import check_embeddings_close
         # [Encoder-only]
         pytest.param(
             "BAAI/bge-base-en-v1.5",
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[
+                pytest.mark.core_model, pytest.mark.cpu_model,
+                pytest.mark.slow_test
+            ],
         ),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 3b13a12276f5d..c22d94948d249 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -18,6 +18,26 @@ from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS,
                        HF_EXAMPLE_MODELS, HfExampleModels)
 from .utils import dummy_hf_overrides
 
+# This minimal list of model architectures is smaller than the total list of
+# supported models. The intention is that in the "typical" regression testing
+# scenario, we only test initializing these models. This subset was chosen
+# to include representative examples of model varieties/workloads (conditional
+# generation, sequence classification, causal LM, ranking, chat, reward model,
+# multimodal, geospatial, voice, embedding, MTP)
+MINIMAL_MODEL_ARCH_LIST = [
+    "LlavaForConditionalGeneration", "Llama4ForConditionalGeneration",
+    "BertForSequenceClassification", "Gemma3nForCausalLM", "JinaVLForRanking",
+    "InternVLChatModel", "InternLM2ForRewardModel",
+    "TransformersForMultimodalLM", "PrithviGeoSpatialMAE", "UltravoxModel",
+    "DeepSeekMTPModel", "XLMRobertaModel"
+]
+
+# This list is the complement of the minimal list above. The intention is that
+# this list of models is only tested in a "special case" i.e. most PRs should
+# not test these models
+OTHER_MODEL_ARCH_LIST = (set(HF_EXAMPLE_MODELS.get_supported_archs()) -
+                         set(MINIMAL_MODEL_ARCH_LIST))
+
 
 @create_new_process_for_each_test()
 def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
@@ -101,8 +121,23 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
             max_num_seqs=model_info.max_num_seqs)
 
 
-@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
-def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
+@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
+def test_can_initialize_small_subset(model_arch: str,
+                                     monkeypatch: pytest.MonkeyPatch):
+    """Test initializing small subset of supported models"""
+    if model_arch == "Lfm2ForCausalLM":
+        pytest.skip("Skipping until test supports V1-only models")
+    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
+
+
+@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
+def test_can_initialize_large_subset(model_arch: str,
+                                     monkeypatch: pytest.MonkeyPatch):
+    """Test initializing large subset of supported models
+
+    This test covers the complement of the tests covered in the "small subset"
+    test.
+    """
     if model_arch == "Lfm2ForCausalLM":
         pytest.skip("Skipping until test supports V1-only models")
     can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)