mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-28 05:37:04 +08:00
[CI] Speed up model unit tests in CI (#24253)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
This commit is contained in:
parent
9d2a44606d
commit
c8c42597ab
@ -571,36 +571,85 @@ steps:
|
|||||||
|
|
||||||
##### models test #####
|
##### models test #####
|
||||||
|
|
||||||
- label: Basic Models Test # 57min
|
- label: Basic Models Tests (Initialization)
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models
|
- tests/models/test_initialization.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_transformers.py
|
# Run a subset of model initialization tests
|
||||||
- pytest -v -s models/test_registry.py
|
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
||||||
- pytest -v -s models/test_utils.py
|
|
||||||
- pytest -v -s models/test_vision.py
|
|
||||||
- pytest -v -s models/test_initialization.py
|
|
||||||
|
|
||||||
- label: Language Models Test (Standard) # 35min
|
- label: Basic Models Tests (Extra Initialization) %N
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor/models/
|
||||||
|
- tests/models/test_initialization.py
|
||||||
|
commands:
|
||||||
|
# Runs only when vLLM model source is modified: tests initialization of a large
|
||||||
|
# subset of supported models (the complement of the small subset in the test
|
||||||
|
# above). Also runs if the model initialization test file is modified.
|
||||||
|
- pytest -v -s models/test_initialization.py \
|
||||||
|
-k 'not test_can_initialize_small_subset' \
|
||||||
|
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
||||||
|
--shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||||
|
parallelism: 2
|
||||||
|
|
||||||
|
- label: Basic Models Tests (Other)
|
||||||
|
timeout_in_minutes: 45
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
torch_nightly: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/test_transformers.py
|
||||||
|
- tests/models/test_registry.py
|
||||||
|
- tests/models/test_utils.py
|
||||||
|
- tests/models/test_vision.py
|
||||||
|
commands:
|
||||||
|
- pytest -v -s models/test_transformers.py \
|
||||||
|
models/test_registry.py \
|
||||||
|
models/test_utils.py \
|
||||||
|
models/test_vision.py
|
||||||
|
|
||||||
|
- label: Language Models Tests (Standard)
|
||||||
|
timeout_in_minutes: 25
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
torch_nightly: true
|
||||||
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language
|
- tests/models/language
|
||||||
commands:
|
commands:
|
||||||
|
# Test standard language models, excluding a subset of slow tests
|
||||||
- pip freeze | grep -E 'torch'
|
- pip freeze | grep -E 'torch'
|
||||||
- pytest -v -s models/language -m core_model
|
- pytest -v -s models/language -m 'core_model and (not slow_test)'
|
||||||
|
|
||||||
- label: Language Models Test (Hybrid) # 35 min
|
- label: Language Models Tests (Extra Standard) %N
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor/models/
|
||||||
|
- tests/models/language/pooling/test_embedding.py
|
||||||
|
- tests/models/language/generation/test_common.py
|
||||||
|
- tests/models/language/pooling/test_classification.py
|
||||||
|
commands:
|
||||||
|
# Shard the slow subset of the standard language model tests. Only runs when
|
||||||
|
# model source is modified, or when the listed test files are modified.
|
||||||
|
- pip freeze | grep -E 'torch'
|
||||||
|
- pytest -v -s models/language -m 'core_model and slow_test' \
|
||||||
|
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
||||||
|
--shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||||
|
parallelism: 2
|
||||||
|
|
||||||
|
- label: Language Models Tests (Hybrid) %N
|
||||||
|
timeout_in_minutes: 75
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
torch_nightly: true
|
||||||
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language/generation
|
- tests/models/language/generation
|
||||||
commands:
|
commands:
|
||||||
@ -608,7 +657,12 @@ steps:
|
|||||||
# Note: also needed to run plamo2 model in vLLM
|
# Note: also needed to run plamo2 model in vLLM
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
- pytest -v -s models/language/generation -m hybrid_model
|
# Shard hybrid language model tests
|
||||||
|
- pytest -v -s models/language/generation \
|
||||||
|
-m hybrid_model \
|
||||||
|
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
||||||
|
--shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||||
|
parallelism: 2
|
||||||
|
|
||||||
- label: Language Models Test (Extended Generation) # 80min
|
- label: Language Models Test (Extended Generation) # 80min
|
||||||
timeout_in_minutes: 110
|
timeout_in_minutes: 110
|
||||||
|
|||||||
@ -145,6 +145,7 @@ skip_gitignore = true
|
|||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
markers = [
|
markers = [
|
||||||
|
"slow_test",
|
||||||
"skip_global_cleanup",
|
"skip_global_cleanup",
|
||||||
"core_model: enable this model test in each PR instead of only nightly",
|
"core_model: enable this model test in each PR instead of only nightly",
|
||||||
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
|
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
|
||||||
|
|||||||
@ -38,7 +38,7 @@ AITER_MODEL_LIST = [
|
|||||||
[
|
[
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"bigscience/bloom-560m", # bloom - testing alibi slopes
|
"bigscience/bloom-560m", # bloom - testing alibi slopes
|
||||||
marks=[pytest.mark.core_model],
|
marks=[pytest.mark.core_model, pytest.mark.slow_test],
|
||||||
),
|
),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"openai-community/gpt2", # gpt2
|
"openai-community/gpt2", # gpt2
|
||||||
@ -49,7 +49,10 @@ AITER_MODEL_LIST = [
|
|||||||
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
|
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"google/gemma-1.1-2b-it", # gemma
|
"google/gemma-1.1-2b-it", # gemma
|
||||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
marks=[
|
||||||
|
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||||
|
pytest.mark.slow_test
|
||||||
|
],
|
||||||
),
|
),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"zai-org/chatglm3-6b", # chatglm (text-only)
|
"zai-org/chatglm3-6b", # chatglm (text-only)
|
||||||
@ -70,14 +73,17 @@ AITER_MODEL_LIST = [
|
|||||||
),
|
),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"microsoft/phi-2", # phi
|
"microsoft/phi-2", # phi
|
||||||
marks=[pytest.mark.core_model],
|
marks=[pytest.mark.core_model, pytest.mark.slow_test],
|
||||||
),
|
),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"Qwen/Qwen-7B-Chat", # qwen (text-only)
|
"Qwen/Qwen-7B-Chat", # qwen (text-only)
|
||||||
),
|
),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
|
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
|
||||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
marks=[
|
||||||
|
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||||
|
pytest.mark.slow_test
|
||||||
|
],
|
||||||
),
|
),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"Qwen/Qwen3-8B", # qwen (text-only)
|
"Qwen/Qwen3-8B", # qwen (text-only)
|
||||||
|
|||||||
@ -11,7 +11,10 @@ from vllm.platforms import current_platform
|
|||||||
"model",
|
"model",
|
||||||
[
|
[
|
||||||
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
|
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
|
||||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
marks=[
|
||||||
|
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||||
|
pytest.mark.slow_test
|
||||||
|
]),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("dtype",
|
@pytest.mark.parametrize("dtype",
|
||||||
|
|||||||
@ -19,7 +19,7 @@ from ...utils import check_embeddings_close
|
|||||||
# model code with bidirectional attention.
|
# model code with bidirectional attention.
|
||||||
# [Decoder-only]
|
# [Decoder-only]
|
||||||
pytest.param("BAAI/bge-multilingual-gemma2",
|
pytest.param("BAAI/bge-multilingual-gemma2",
|
||||||
marks=[pytest.mark.core_model]),
|
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"intfloat/e5-mistral-7b-instruct",
|
"intfloat/e5-mistral-7b-instruct",
|
||||||
# CPU v1 doesn't support sliding window
|
# CPU v1 doesn't support sliding window
|
||||||
@ -29,7 +29,10 @@ from ...utils import check_embeddings_close
|
|||||||
# [Encoder-only]
|
# [Encoder-only]
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"BAAI/bge-base-en-v1.5",
|
"BAAI/bge-base-en-v1.5",
|
||||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
marks=[
|
||||||
|
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||||
|
pytest.mark.slow_test
|
||||||
|
],
|
||||||
),
|
),
|
||||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
||||||
pytest.param("intfloat/multilingual-e5-small"),
|
pytest.param("intfloat/multilingual-e5-small"),
|
||||||
|
|||||||
@ -18,6 +18,26 @@ from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS,
|
|||||||
HF_EXAMPLE_MODELS, HfExampleModels)
|
HF_EXAMPLE_MODELS, HfExampleModels)
|
||||||
from .utils import dummy_hf_overrides
|
from .utils import dummy_hf_overrides
|
||||||
|
|
||||||
|
# This minimal list of model architectures is smaller than the total list of
|
||||||
|
# supported models. The intention is that in the "typical" regression testing
|
||||||
|
# scenario, we only test initializing these models. This subset was chosen
|
||||||
|
# to include representative examples of model varieties/workloads (conditional
|
||||||
|
# generation, sequence classification, causal LM, ranking, chat, reward model,
|
||||||
|
# multimodal, geospatial, voice, embedding, MTP)
|
||||||
|
MINIMAL_MODEL_ARCH_LIST = [
|
||||||
|
"LlavaForConditionalGeneration", "Llama4ForConditionalGeneration",
|
||||||
|
"BertForSequenceClassification", "Gemma3nForCausalLM", "JinaVLForRanking",
|
||||||
|
"InternVLChatModel", "InternLM2ForRewardModel",
|
||||||
|
"TransformersForMultimodalLM", "PrithviGeoSpatialMAE", "UltravoxModel",
|
||||||
|
"DeepSeekMTPModel", "XLMRobertaModel"
|
||||||
|
]
|
||||||
|
|
||||||
|
# This set is the complement of the minimal list above. The intention is that
|
||||||
|
# these models are only tested in a "special case", i.e. most PRs should
|
||||||
|
# not test these models.
|
||||||
|
OTHER_MODEL_ARCH_LIST = (set(HF_EXAMPLE_MODELS.get_supported_archs()) -
|
||||||
|
set(MINIMAL_MODEL_ARCH_LIST))
|
||||||
|
|
||||||
|
|
||||||
@create_new_process_for_each_test()
|
@create_new_process_for_each_test()
|
||||||
def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||||
@ -101,8 +121,23 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
|||||||
max_num_seqs=model_info.max_num_seqs)
|
max_num_seqs=model_info.max_num_seqs)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
|
@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
|
||||||
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
|
def test_can_initialize_small_subset(model_arch: str,
|
||||||
|
monkeypatch: pytest.MonkeyPatch):
|
||||||
|
"""Test initializing a small subset of supported models."""
|
||||||
|
if model_arch == "Lfm2ForCausalLM":
|
||||||
|
pytest.skip("Skipping until test supports V1-only models")
|
||||||
|
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
|
||||||
|
def test_can_initialize_large_subset(model_arch: str,
|
||||||
|
monkeypatch: pytest.MonkeyPatch):
|
||||||
|
"""Test initializing a large subset of supported models
|
||||||
|
|
||||||
|
This test covers the complement of the tests covered in the "small subset"
|
||||||
|
test.
|
||||||
|
"""
|
||||||
if model_arch == "Lfm2ForCausalLM":
|
if model_arch == "Lfm2ForCausalLM":
|
||||||
pytest.skip("Skipping until test supports V1-only models")
|
pytest.skip("Skipping until test supports V1-only models")
|
||||||
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
|
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user