Add SSM and Hybrid Models Test

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Tyler Michael Smith 2025-03-15 17:26:01 +00:00
parent 61c6a5a796
commit 696245c2fc
2 changed files with 571 additions and 566 deletions

View File

@ -28,9 +28,9 @@
# Note that all steps execute in parallel. # Note that all steps execute in parallel.
steps: steps:
##### fast check tests ##### ##### fast check tests #####
- label: Documentation Build # 2min - label: Documentation Build # 2min
working_dir: "/vllm-workspace/test_docs/docs" working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true fast_check: true
no_gpu: True no_gpu: True
@ -40,7 +40,7 @@ steps:
# Check API reference (if it fails, you may have missing mock imports) # Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/api/inference_params.html - grep \"sig sig-object py\" build/html/api/inference_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min - label: Async Engine, Inputs, Utils, Worker Test # 24min
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/mq_llm_engine - tests/mq_llm_engine
@ -60,14 +60,14 @@ steps:
- pytest -v -s test_utils.py # Utils - pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker - pytest -v -s worker # Worker
- label: Python-only Installation Test - label: Python-only Installation Test
source_file_dependencies: source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh - tests/standalone_tests/python_only_compile.sh
- setup.py - setup.py
commands: commands:
- bash standalone_tests/python_only_compile.sh - bash standalone_tests/python_only_compile.sh
- label: Basic Correctness Test # 30min - label: Basic Correctness Test # 30min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
@ -83,7 +83,7 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test - label: Chunked Prefill Test
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/basic_correctness/test_chunked_prefill - tests/basic_correctness/test_chunked_prefill
@ -91,7 +91,7 @@ steps:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min - label: Core Test # 10min
mirror_hardwares: [amd] mirror_hardwares: [amd]
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
@ -101,7 +101,7 @@ steps:
commands: commands:
- pytest -v -s core - pytest -v -s core
- label: Entrypoints Test # 40min - label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
mirror_hardwares: [amd] mirror_hardwares: [amd]
@ -122,7 +122,7 @@ steps:
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min - label: Distributed Tests (4 GPUs) # 10min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -148,7 +148,7 @@ steps:
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd - popd
- label: Metrics, Tracing Test # 10min - label: Metrics, Tracing Test # 10min
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -163,10 +163,10 @@ steps:
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'" 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing - pytest -v -s tracing
##### fast check tests ##### ##### fast check tests #####
##### 1 GPU test ##### ##### 1 GPU test #####
- label: Regression Test # 5min - label: Regression Test # 5min
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -176,7 +176,7 @@ steps:
- pytest -v -s test_regression.py - pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min - label: Engine Test # 10min
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -190,7 +190,7 @@ steps:
# OOM in the CI unless we run this separately # OOM in the CI unless we run this separately
- pytest -v -s tokenization - pytest -v -s tokenization
- label: V1 Test - label: V1 Test
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -212,7 +212,7 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Examples Test # 25min - label: Examples Test # 25min
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
@ -234,7 +234,7 @@ steps:
- python3 offline_inference/basic/score.py - python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min - label: Prefix Caching Test # 9min
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -242,7 +242,7 @@ steps:
commands: commands:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
- label: Samplers Test # 36min - label: Samplers Test # 36min
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/layers - vllm/model_executor/layers
- vllm/sampling_metadata.py - vllm/sampling_metadata.py
@ -252,7 +252,7 @@ steps:
- pytest -v -s samplers - pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test # 5min - label: LogitsProcessor Test # 5min
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/layers - vllm/model_executor/layers
@ -263,7 +263,7 @@ steps:
- pytest -v -s test_logits_processor.py - pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py - pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 40min - label: Speculative decoding tests # 40min
source_file_dependencies: source_file_dependencies:
- vllm/spec_decode - vllm/spec_decode
- tests/spec_decode - tests/spec_decode
@ -273,7 +273,7 @@ steps:
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each - label: LoRA Test %N # 15min each
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
@ -281,7 +281,7 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
parallelism: 4 parallelism: 4
- label: PyTorch Fullgraph Smoke Test # 9min - label: PyTorch Fullgraph Smoke Test # 9min
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
@ -291,14 +291,14 @@ steps:
- pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_toy_llama.py
- label: PyTorch Fullgraph Test # 18min - label: PyTorch Fullgraph Test # 18min
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/test_full_graph.py - pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 1h each - label: Kernels Test %N # 1h each
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
@ -308,7 +308,7 @@ steps:
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4 parallelism: 4
- label: Tensorizer Test # 11min - label: Tensorizer Test # 11min
mirror_hardwares: [amd] mirror_hardwares: [amd]
soft_fail: true soft_fail: true
source_file_dependencies: source_file_dependencies:
@ -319,7 +319,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader - pytest -v -s tensorizer_loader
- label: Benchmarks # 9min - label: Benchmarks # 9min
working_dir: "/vllm-workspace/.buildkite" working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
@ -327,14 +327,14 @@ steps:
commands: commands:
- bash run-benchmarks.sh - bash run-benchmarks.sh
- label: Quantization Test # 33min - label: Quantization Test # 33min
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
- tests/quantization - tests/quantization
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
@ -343,7 +343,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1 - bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: OpenAI API correctness - label: OpenAI API correctness
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/entrypoints/openai/ - vllm/entrypoints/openai/
@ -351,25 +351,25 @@ steps:
commands: # LMEval+Transcription WER check commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 5min - label: Encoder Decoder tests # 5min
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/encoder_decoder - tests/encoder_decoder
commands: commands:
- pytest -v -s encoder_decoder - pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min - label: OpenAI-Compatible Tool Use # 20 min
fast_check: false fast_check: false
mirror_hardwares: [ amd ] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/tool_use - tests/tool_use
commands: commands:
- pytest -v -s tool_use - pytest -v -s tool_use
##### models test ##### ##### models test #####
- label: Basic Models Test # 24min - label: Basic Models Test # 24min
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models - tests/models
@ -379,7 +379,7 @@ steps:
# V1 Test: https://github.com/vllm-project/vllm/issues/14531 # V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) # 32min - label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -390,7 +390,16 @@ steps:
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model - pytest -v -s models/embedding/language -m core_model
- label: Language Models Test (Extended) # 1h10min - label: SSM and Hybrid Models Test # X min
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language/test_hybrid.py
- tests/models/decoder_only/language/test_mamba.py
commands:
- pytest -v -s models/decoder_only/language/test_hybrid.py
- pytest -v -s models/decoder_only/language/test_mamba.py
- label: Language Models Test (Extended) # 1h10min
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -401,7 +410,7 @@ steps:
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model' - pytest -v -s models/embedding/language -m 'not core_model'
- label: Multi-Modal Models Test (Standard) # 40min - label: Multi-Modal Models Test (Standard) # 40min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -420,7 +429,7 @@ steps:
- pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model
- label: Multi-Modal Models Test (Extended) 1 # 48m - label: Multi-Modal Models Test (Extended) 1 # 48m
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -440,7 +449,7 @@ steps:
- pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 # 38m - label: Multi-Modal Models Test (Extended) 2 # 38m
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -449,8 +458,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
# This test is used only in PR development phase to test individual models and should never run on main # This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test - label: Custom Models Test
optional: true optional: true
commands: commands:
- echo 'Testing custom models...' - echo 'Testing custom models...'
@ -458,10 +467,10 @@ steps:
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min - label: Distributed Comm Ops Test # 7min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -471,7 +480,7 @@ steps:
- pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py - pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min - label: 2 Node Tests (4 GPUs in total) # 16min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
num_nodes: 2 num_nodes: 2
@ -489,7 +498,7 @@ steps:
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 40min - label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
@ -523,7 +532,7 @@ steps:
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -542,7 +551,7 @@ steps:
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- label: Multi-step Tests (4 GPUs) # 36min - label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -562,7 +571,7 @@ steps:
# - pytest -v -s multi_step/test_correctness_async_llm.py # - pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py - pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 45min - label: Pipeline Parallelism Test # 45min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -575,7 +584,7 @@ steps:
- pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py - pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA TP Test (Distributed) - label: LoRA TP Test (Distributed)
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
@ -593,8 +602,7 @@ steps:
- pytest -v -s -x lora/test_minicpmv_tp.py - pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py - pytest -v -s -x lora/test_transfomers_model.py
- label: Weight Loading Multiple GPU Test # 33min
- label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -603,7 +611,7 @@ steps:
commands: commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional - label: Weight Loading Multiple GPU Test - Large Models # optional
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
gpu: a100 gpu: a100
@ -614,11 +622,10 @@ steps:
commands: commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
##### multi gpus test #####
##### A100 test #####
##### multi gpus test ##### - label: Distributed Tests (A100) # optional
##### A100 test #####
- label: Distributed Tests (A100) # optional
gpu: a100 gpu: a100
optional: true optional: true
num_gpus: 4 num_gpus: 4
@ -632,7 +639,7 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional - label: LM Eval Large Models # optional
gpu: a100 gpu: a100
optional: true optional: true
num_gpus: 4 num_gpus: 4

View File

@ -38,8 +38,6 @@ from .utils import (is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix) maybe_prefix)
KVCache = Tuple[torch.Tensor, torch.Tensor]
class BambaMLP(nn.Module): class BambaMLP(nn.Module):