Add SSM and Hybrid Models Test

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2026-06-09 01:15:42 +08:00 · 2025-03-15 17:26:01 +00:00 · 2025-03-15 17:26:01 +00:00 · 696245c2fc
commit 696245c2fc
parent 61c6a5a796
2 changed files with 571 additions and 566 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -28,9 +28,9 @@
 #   Note that all steps execute in parallel.
 steps:
-##### fast check tests  #####
+    ##### fast check tests  #####
- label: Documentation Build # 2min
+    - label: Documentation Build # 2min
      working_dir: "/vllm-workspace/test_docs/docs"
      fast_check: true
      no_gpu: True
@@ -40,7 +40,7 @@ steps:
          # Check API reference (if it fails, you may have missing mock imports)
          - grep \"sig sig-object py\" build/html/api/inference_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min
+    - label: Async Engine, Inputs, Utils, Worker Test # 24min
      source_file_dependencies:
          - vllm/
          - tests/mq_llm_engine
@@ -60,14 +60,14 @@ steps:
          - pytest -v -s test_utils.py # Utils
          - pytest -v -s worker # Worker
- label: Python-only Installation Test
+    - label: Python-only Installation Test
      source_file_dependencies:
          - tests/standalone_tests/python_only_compile.sh
          - setup.py
      commands:
          - bash standalone_tests/python_only_compile.sh
- label: Basic Correctness Test # 30min
+    - label: Basic Correctness Test # 30min
      #mirror_hardwares: [amd]
      fast_check: true
      source_file_dependencies:
@@ -83,7 +83,7 @@ steps:
          - pytest -v -s basic_correctness/test_cpu_offload.py
          - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
+    - label: Chunked Prefill Test
      source_file_dependencies:
          - vllm/
          - tests/basic_correctness/test_chunked_prefill
@@ -91,7 +91,7 @@ steps:
          - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
          - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min
+    - label: Core Test # 10min
      mirror_hardwares: [amd]
      fast_check: true
      source_file_dependencies:
@@ -101,7 +101,7 @@ steps:
      commands:
          - pytest -v -s core
- label: Entrypoints Test # 40min
+    - label: Entrypoints Test # 40min
      working_dir: "/vllm-workspace/tests"
      fast_check: true
      mirror_hardwares: [amd]
@@ -122,7 +122,7 @@ steps:
          - pytest -v -s entrypoints/test_chat_utils.py
          - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min
+    - label: Distributed Tests (4 GPUs) # 10min
      working_dir: "/vllm-workspace/tests"
      num_gpus: 4
      source_file_dependencies:
@@ -148,7 +148,7 @@ steps:
          - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
          - popd
- label: Metrics, Tracing Test # 10min
+    - label: Metrics, Tracing Test # 10min
      num_gpus: 2
      source_file_dependencies:
          - vllm/
@@ -163,10 +163,10 @@ steps:
            'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
          - pytest -v -s tracing
-##### fast check tests  #####
+    ##### fast check tests  #####
-#####  1 GPU test  #####
+    #####  1 GPU test  #####
- label: Regression Test # 5min
+    - label: Regression Test # 5min
      mirror_hardwares: [amd]
      source_file_dependencies:
          - vllm/
@@ -176,7 +176,7 @@ steps:
          - pytest -v -s test_regression.py
      working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min
+    - label: Engine Test # 10min
      mirror_hardwares: [amd]
      source_file_dependencies:
          - vllm/
@@ -190,7 +190,7 @@ steps:
          # OOM in the CI unless we run this separately
          - pytest -v -s tokenization
- label: V1 Test
+    - label: V1 Test
      #mirror_hardwares: [amd]
      source_file_dependencies:
          - vllm/
@@ -212,7 +212,7 @@ steps:
          - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
          - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Examples Test # 25min
+    - label: Examples Test # 25min
      working_dir: "/vllm-workspace/examples"
      #mirror_hardwares: [amd]
      source_file_dependencies:
@@ -234,7 +234,7 @@ steps:
          - python3 offline_inference/basic/score.py
          - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
+    - label: Prefix Caching Test # 9min
      mirror_hardwares: [amd]
      source_file_dependencies:
          - vllm/
@@ -242,7 +242,7 @@ steps:
      commands:
          - pytest -v -s prefix_caching
- label: Samplers Test # 36min
+    - label: Samplers Test # 36min
      source_file_dependencies:
          - vllm/model_executor/layers
          - vllm/sampling_metadata.py
@@ -252,7 +252,7 @@ steps:
          - pytest -v -s samplers
          - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test # 5min
+    - label: LogitsProcessor Test # 5min
      mirror_hardwares: [amd]
      source_file_dependencies:
          - vllm/model_executor/layers
@@ -263,7 +263,7 @@ steps:
          - pytest -v -s test_logits_processor.py
          - pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 40min
+    - label: Speculative decoding tests # 40min
      source_file_dependencies:
          - vllm/spec_decode
          - tests/spec_decode
@@ -273,7 +273,7 @@ steps:
          - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
          - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each
+    - label: LoRA Test %N # 15min each
      mirror_hardwares: [amd]
      source_file_dependencies:
          - vllm/lora
@@ -281,7 +281,7 @@ steps:
      command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py  --ignore=lora/test_transfomers_model.py
      parallelism: 4
- label: PyTorch Fullgraph Smoke Test # 9min
+    - label: PyTorch Fullgraph Smoke Test # 9min
      source_file_dependencies:
          - vllm/
          - tests/compile
@@ -291,14 +291,14 @@ steps:
          - pytest -v -s compile/piecewise/test_simple.py
          - pytest -v -s compile/piecewise/test_toy_llama.py
- label: PyTorch Fullgraph Test # 18min
+    - label: PyTorch Fullgraph Test # 18min
      source_file_dependencies:
          - vllm/
          - tests/compile
      commands:
          - pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 1h each
+    - label: Kernels Test %N # 1h each
      mirror_hardwares: [amd]
      source_file_dependencies:
          - csrc/
@@ -308,7 +308,7 @@ steps:
          - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
      parallelism: 4
- label: Tensorizer Test # 11min
+    - label: Tensorizer Test # 11min
      mirror_hardwares: [amd]
      soft_fail: true
      source_file_dependencies:
@@ -319,7 +319,7 @@ steps:
          - export VLLM_WORKER_MULTIPROC_METHOD=spawn
          - pytest -v -s tensorizer_loader
- label: Benchmarks # 9min
+    - label: Benchmarks # 9min
      working_dir: "/vllm-workspace/.buildkite"
      mirror_hardwares: [amd]
      source_file_dependencies:
@@ -327,14 +327,14 @@ steps:
      commands:
          - bash run-benchmarks.sh
- label: Quantization Test # 33min
+    - label: Quantization Test # 33min
      source_file_dependencies:
          - csrc/
          - vllm/model_executor/layers/quantization
          - tests/quantization
      command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
+    - label: LM Eval Small Models # 53min
      working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
      source_file_dependencies:
          - csrc/
@@ -343,7 +343,7 @@ steps:
          - export VLLM_WORKER_MULTIPROC_METHOD=spawn
          - bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: OpenAI API correctness
+    - label: OpenAI API correctness
      source_file_dependencies:
          - csrc/
          - vllm/entrypoints/openai/
@@ -351,25 +351,25 @@ steps:
      commands: # LMEval+Transcription WER check
          - pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 5min
+    - label: Encoder Decoder tests # 5min
      source_file_dependencies:
          - vllm/
          - tests/encoder_decoder
      commands:
          - pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min
+    - label: OpenAI-Compatible Tool Use # 20 min
      fast_check: false
-  mirror_hardwares: [ amd ]
+      mirror_hardwares: [amd]
      source_file_dependencies:
          - vllm/
          - tests/tool_use
      commands:
          - pytest -v -s tool_use
-#####  models test  #####
+    #####  models test  #####
- label: Basic Models Test # 24min
+    - label: Basic Models Test # 24min
      source_file_dependencies:
          - vllm/
          - tests/models
@@ -379,7 +379,7 @@ steps:
          # V1 Test: https://github.com/vllm-project/vllm/issues/14531
          - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) # 32min
+    - label: Language Models Test (Standard) # 32min
      #mirror_hardwares: [amd]
      source_file_dependencies:
          - vllm/
@@ -390,7 +390,16 @@ steps:
          - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
          - pytest -v -s models/embedding/language -m core_model
- label: Language Models Test (Extended) # 1h10min
+    - label: SSM and Hybrid Models Test # X min
      source_file_dependencies:
          - vllm/
          - tests/models/decoder_only/language/test_hybrid.py
          - tests/models/decoder_only/language/test_mamba.py
      commands:
          - pytest -v -s models/decoder_only/language/test_hybrid.py
          - pytest -v -s models/decoder_only/language/test_mamba.py
    - label: Language Models Test (Extended) # 1h10min
      optional: true
      source_file_dependencies:
          - vllm/
@@ -401,7 +410,7 @@ steps:
          - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
          - pytest -v -s models/embedding/language -m 'not core_model'
- label: Multi-Modal Models Test (Standard) # 40min
+    - label: Multi-Modal Models Test (Standard) # 40min
      #mirror_hardwares: [amd]
      source_file_dependencies:
          - vllm/
@@ -420,7 +429,7 @@ steps:
          - pytest -v -s models/encoder_decoder/language -m core_model
          - pytest -v -s models/encoder_decoder/vision_language -m core_model
- label: Multi-Modal Models Test (Extended) 1 # 48m
+    - label: Multi-Modal Models Test (Extended) 1 # 48m
      optional: true
      source_file_dependencies:
          - vllm/
@@ -440,7 +449,7 @@ steps:
          - pytest -v -s models/encoder_decoder/language -m 'not core_model'
          - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 # 38m
+    - label: Multi-Modal Models Test (Extended) 2 # 38m
      optional: true
      source_file_dependencies:
          - vllm/
@@ -449,8 +458,8 @@ steps:
          - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
          - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
-# This test is used only in PR development phase to test individual models and should never run on main
+    # This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
+    - label: Custom Models Test
      optional: true
      commands:
          - echo 'Testing custom models...'
@@ -458,10 +467,10 @@ steps:
          # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
          # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
-#####  1 GPU test  #####
+    #####  1 GPU test  #####
-#####  multi gpus test  #####
+    #####  multi gpus test  #####
- label: Distributed Comm Ops Test # 7min
+    - label: Distributed Comm Ops Test # 7min
      working_dir: "/vllm-workspace/tests"
      num_gpus: 2
      source_file_dependencies:
@@ -471,7 +480,7 @@ steps:
          - pytest -v -s distributed/test_comm_ops.py
          - pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min
+    - label: 2 Node Tests (4 GPUs in total) # 16min
      working_dir: "/vllm-workspace/tests"
      num_gpus: 2
      num_nodes: 2
@@ -489,7 +498,7 @@ steps:
          - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
            - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 40min
+    - label: Distributed Tests (2 GPUs) # 40min
      #mirror_hardwares: [amd]
      working_dir: "/vllm-workspace/tests"
      num_gpus: 2
@@ -523,7 +532,7 @@ steps:
          - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
          - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- label: Plugin Tests (2 GPUs) # 40min
+    - label: Plugin Tests (2 GPUs) # 40min
      working_dir: "/vllm-workspace/tests"
      num_gpus: 2
      source_file_dependencies:
@@ -542,7 +551,7 @@ steps:
          - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
          - pytest -v -s models/test_oot_registration.py # it needs a clean process
- label: Multi-step Tests (4 GPUs) # 36min
+    - label: Multi-step Tests (4 GPUs) # 36min
      working_dir: "/vllm-workspace/tests"
      num_gpus: 4
      source_file_dependencies:
@@ -562,7 +571,7 @@ steps:
          # - pytest -v -s multi_step/test_correctness_async_llm.py
          - pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 45min
+    - label: Pipeline Parallelism Test # 45min
      working_dir: "/vllm-workspace/tests"
      num_gpus: 4
      source_file_dependencies:
@@ -575,7 +584,7 @@ steps:
          - pytest -v -s distributed/test_pp_cudagraph.py
          - pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA TP Test (Distributed)
+    - label: LoRA TP Test (Distributed)
      num_gpus: 4
      source_file_dependencies:
          - vllm/lora
@@ -593,8 +602,7 @@ steps:
          - pytest -v -s -x lora/test_minicpmv_tp.py
          - pytest -v -s -x lora/test_transfomers_model.py
-
+    - label: Weight Loading Multiple GPU Test # 33min
 - label: Weight Loading Multiple GPU Test  # 33min
      working_dir: "/vllm-workspace/tests"
      num_gpus: 2
      source_file_dependencies:
@@ -603,7 +611,7 @@ steps:
      commands:
          - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional
+    - label: Weight Loading Multiple GPU Test - Large Models # optional
      working_dir: "/vllm-workspace/tests"
      num_gpus: 2
      gpu: a100
@@ -614,11 +622,10 @@ steps:
      commands:
          - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
    ##### multi gpus test #####
    ##### A100 test #####
-##### multi gpus test #####
+    - label: Distributed Tests (A100) # optional
 ##### A100 test #####
 - label: Distributed Tests (A100) # optional
      gpu: a100
      optional: true
      num_gpus: 4
@@ -632,7 +639,7 @@ steps:
          - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
          - pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
+    - label: LM Eval Large Models # optional
      gpu: a100
      optional: true
      num_gpus: 4
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -38,8 +38,6 @@ from .utils import (is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 class BambaMLP(nn.Module):