mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 07:55:01 +08:00
[CI] Reorganize compile tests so new tests are automatically included in CI (#28625)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
This commit is contained in:
parent
4f5299f717
commit
2c8b9182b5
@ -187,7 +187,7 @@ steps:
|
|||||||
- tests/distributed/test_utils
|
- tests/distributed/test_utils
|
||||||
- tests/distributed/test_pynccl
|
- tests/distributed/test_pynccl
|
||||||
- tests/distributed/test_events
|
- tests/distributed/test_events
|
||||||
- tests/compile/test_basic_correctness
|
- tests/compile/fullgraph/test_basic_correctness.py
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
@ -215,7 +215,7 @@ steps:
|
|||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
- pytest -v -s distributed/test_events.py
|
||||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||||
@ -493,17 +493,12 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_pass_manager.py
|
# Run unit tests defined directly under compile/,
|
||||||
- pytest -v -s compile/test_fusion.py
|
# not including subdirectories, which are usually heavier
|
||||||
- pytest -v -s compile/test_fusion_attn.py
|
# tests covered elsewhere.
|
||||||
- pytest -v -s compile/test_functionalization.py
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
# - pytest -v -s compile/test_sequence_parallelism.py
|
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
||||||
# - pytest -v -s compile/test_async_tp.py
|
|
||||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
|
||||||
- pytest -v -s compile/test_decorator.py
|
|
||||||
- pytest -v -s compile/test_noop_elimination.py
|
|
||||||
- pytest -v -s compile/test_aot_compile.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@ -515,9 +510,11 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
# Run smoke tests under fullgraph directory, except test_full_graph.py
|
||||||
- pytest -v -s compile/test_multimodal_compile.py
|
# as it is a heavy test that is covered in other steps.
|
||||||
- pytest -v -s compile/piecewise/
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
|
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 27min
|
- label: PyTorch Fullgraph Test # 27min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@ -529,10 +526,10 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||||
# Limit to no custom ops to reduce running time
|
# Limit to no custom ops to reduce running time
|
||||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||||
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
|
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
|
||||||
|
|
||||||
- label: Cudagraph test
|
- label: Cudagraph test
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
@ -1066,10 +1063,10 @@ steps:
|
|||||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||||
# Wrap with quotes to escape yaml
|
# Wrap with quotes to escape yaml
|
||||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
|
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
|
||||||
|
|
||||||
- label: Blackwell Fusion E2E Tests # 30 min
|
- label: Blackwell Fusion E2E Tests # 30 min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@ -1086,14 +1083,14 @@ steps:
|
|||||||
- vllm/model_executor/layers/layernorm.py
|
- vllm/model_executor/layers/layernorm.py
|
||||||
- vllm/model_executor/layers/activation.py
|
- vllm/model_executor/layers/activation.py
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
- tests/compile/test_fusions_e2e.py
|
- tests/compile/distributed/test_fusions_e2e.py
|
||||||
- tests/compile/test_full_graph.py
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all e2e fusion tests
|
# Run all e2e fusion tests
|
||||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
- label: ROCm GPT-OSS Eval
|
- label: ROCm GPT-OSS Eval
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@ -1198,7 +1195,7 @@ steps:
|
|||||||
- vllm/worker/worker_base.py
|
- vllm/worker/worker_base.py
|
||||||
- vllm/v1/engine/
|
- vllm/v1/engine/
|
||||||
- vllm/v1/worker/
|
- vllm/v1/worker/
|
||||||
- tests/compile/test_basic_correctness.py
|
- tests/compile/fullgraph/test_basic_correctness.py
|
||||||
- tests/compile/test_wrapper.py
|
- tests/compile/test_wrapper.py
|
||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/entrypoints/llm/test_collective_rpc.py
|
- tests/entrypoints/llm/test_collective_rpc.py
|
||||||
@ -1211,7 +1208,7 @@ steps:
|
|||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
@ -1417,10 +1414,10 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/compile/test_async_tp.py
|
- pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|||||||
@ -167,7 +167,7 @@ steps:
|
|||||||
- tests/distributed/test_utils
|
- tests/distributed/test_utils
|
||||||
- tests/distributed/test_pynccl
|
- tests/distributed/test_pynccl
|
||||||
- tests/distributed/test_events
|
- tests/distributed/test_events
|
||||||
- tests/compile/test_basic_correctness
|
- tests/compile/fullgraph/test_basic_correctness.py
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
@ -197,7 +197,7 @@ steps:
|
|||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
- pytest -v -s distributed/test_events.py
|
||||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||||
@ -445,18 +445,12 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_graph_partition.py
|
# Run unit tests defined directly under compile/,
|
||||||
- pytest -v -s compile/test_config.py
|
# not including subdirectories, which are usually heavier
|
||||||
- pytest -v -s compile/test_pass_manager.py
|
# tests covered elsewhere.
|
||||||
- pytest -v -s compile/test_fusion.py
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
- pytest -v -s compile/test_fusion_attn.py
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
- pytest -v -s compile/test_functionalization.py
|
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
||||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
|
||||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
|
||||||
- pytest -v -s compile/test_decorator.py
|
|
||||||
- pytest -v -s compile/test_noop_elimination.py
|
|
||||||
- pytest -v -s compile/test_aot_compile.py
|
|
||||||
- pytest -v -s compile/test_qk_norm_rope_fusion.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@ -466,9 +460,11 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
# Run smoke tests under fullgraph directory, except test_full_graph.py
|
||||||
- pytest -v -s compile/test_multimodal_compile.py
|
# as it is a heavy test that is covered in other steps.
|
||||||
- pytest -v -s compile/piecewise/
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
|
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 27min
|
- label: PyTorch Fullgraph Test # 27min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@ -479,10 +475,10 @@ steps:
|
|||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||||
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||||
# Limit to no custom ops to reduce running time
|
# Limit to no custom ops to reduce running time
|
||||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||||
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||||
|
|
||||||
- label: Cudagraph test
|
- label: Cudagraph test
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
@ -939,17 +935,22 @@ steps:
|
|||||||
- vllm/model_executor/layers/layernorm.py
|
- vllm/model_executor/layers/layernorm.py
|
||||||
- vllm/model_executor/layers/activation.py
|
- vllm/model_executor/layers/activation.py
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/test_fusion_attn.py
|
||||||
|
- tests/compile/test_silu_mul_quant_fusion.py
|
||||||
|
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
|
- tests/compile/distributed/test_fusions_e2e.py
|
||||||
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||||
# Wrap with quotes to escape yaml
|
# Wrap with quotes to escape yaml
|
||||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
- label: Blackwell Fusion E2E Tests # 30 min
|
- label: Blackwell Fusion E2E Tests # 30 min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@ -966,12 +967,11 @@ steps:
|
|||||||
- vllm/model_executor/layers/layernorm.py
|
- vllm/model_executor/layers/layernorm.py
|
||||||
- vllm/model_executor/layers/activation.py
|
- vllm/model_executor/layers/activation.py
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
- tests/compile/test_fusions_e2e.py
|
- tests/compile/distributed/test_fusions_e2e.py
|
||||||
- tests/compile/test_full_graph.py
|
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all e2e fusion tests
|
# Run all e2e fusion tests
|
||||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||||
|
|
||||||
- label: Blackwell GPT-OSS Eval
|
- label: Blackwell GPT-OSS Eval
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@ -1069,7 +1069,7 @@ steps:
|
|||||||
- vllm/worker/worker_base.py
|
- vllm/worker/worker_base.py
|
||||||
- vllm/v1/engine/
|
- vllm/v1/engine/
|
||||||
- vllm/v1/worker/
|
- vllm/v1/worker/
|
||||||
- tests/compile/test_basic_correctness.py
|
- tests/compile/fullgraph/test_basic_correctness.py
|
||||||
- tests/compile/test_wrapper.py
|
- tests/compile/test_wrapper.py
|
||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/entrypoints/llm/test_collective_rpc.py
|
- tests/entrypoints/llm/test_collective_rpc.py
|
||||||
@ -1084,7 +1084,7 @@ steps:
|
|||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
@ -1264,10 +1264,10 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/compile/test_async_tp.py
|
- pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
|
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
|
|||||||
5
tests/compile/README.md
Normal file
5
tests/compile/README.md
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# compile test folder structure
|
||||||
|
|
||||||
|
- `compile/test_*.py`: unit tests that exercise particular code paths or features. New tests will most likely be added here. New test files added here will be included in CI automatically.
|
||||||
|
- `compile/fullgraph/`: full-model tests, including all tests previously in `compile/piecewise/`. These tests do not target particular features. New test files added here will be included in CI automatically.
|
||||||
|
- `compile/distributed/`: tests that require multiple GPUs. New test files added here will **NOT** be included in CI automatically, because these tests generally need to be manually configured to run on runners with a particular number and type of GPUs.
|
||||||
@ -27,13 +27,13 @@ from vllm.distributed.parallel_state import (
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.system_utils import update_environment_variables
|
from vllm.utils.system_utils import update_environment_variables
|
||||||
|
|
||||||
from ..models.registry import HF_EXAMPLE_MODELS
|
from ...models.registry import HF_EXAMPLE_MODELS
|
||||||
from ..utils import (
|
from ...utils import (
|
||||||
compare_two_settings,
|
compare_two_settings,
|
||||||
create_new_process_for_each_test,
|
create_new_process_for_each_test,
|
||||||
multi_gpu_test,
|
multi_gpu_test,
|
||||||
)
|
)
|
||||||
from .backend import TestBackend
|
from ..backend import TestBackend
|
||||||
|
|
||||||
FP8_DTYPE = current_platform.fp8_dtype()
|
FP8_DTYPE = current_platform.fp8_dtype()
|
||||||
|
|
||||||
@ -33,8 +33,8 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.system_utils import update_environment_variables
|
from vllm.utils.system_utils import update_environment_variables
|
||||||
|
|
||||||
from ..utils import has_module_attribute, multi_gpu_test
|
from ...utils import has_module_attribute, multi_gpu_test
|
||||||
from .backend import TestBackend
|
from ..backend import TestBackend
|
||||||
|
|
||||||
|
|
||||||
class TestAllReduceRMSNormModel(torch.nn.Module):
|
class TestAllReduceRMSNormModel(torch.nn.Module):
|
||||||
@ -18,7 +18,7 @@ from vllm.platforms import current_platform
|
|||||||
from vllm.utils.flashinfer import has_flashinfer
|
from vllm.utils.flashinfer import has_flashinfer
|
||||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
from ..utils import flat_product, multi_gpu_test
|
from ...utils import flat_product, multi_gpu_test
|
||||||
|
|
||||||
is_blackwell = lambda: current_platform.is_device_capability(100)
|
is_blackwell = lambda: current_platform.is_device_capability(100)
|
||||||
"""Are we running on Blackwell, a lot of tests depend on it"""
|
"""Are we running on Blackwell, a lot of tests depend on it"""
|
||||||
@ -32,8 +32,8 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.system_utils import update_environment_variables
|
from vllm.utils.system_utils import update_environment_variables
|
||||||
|
|
||||||
from ..utils import multi_gpu_test
|
from ...utils import multi_gpu_test
|
||||||
from .backend import TestBackend
|
from ..backend import TestBackend
|
||||||
|
|
||||||
FP8_DTYPE = current_platform.fp8_dtype()
|
FP8_DTYPE = current_platform.fp8_dtype()
|
||||||
prompts = [
|
prompts = [
|
||||||
0
tests/compile/fullgraph/__init__.py
Normal file
0
tests/compile/fullgraph/__init__.py
Normal file
@ -7,7 +7,7 @@ import pytest
|
|||||||
from vllm.config import CompilationMode
|
from vllm.config import CompilationMode
|
||||||
from vllm.utils.torch_utils import cuda_device_count_stateless
|
from vllm.utils.torch_utils import cuda_device_count_stateless
|
||||||
|
|
||||||
from ..utils import compare_all_settings
|
from ...utils import compare_all_settings
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
@ -15,7 +15,7 @@ from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassC
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
from ..utils import create_new_process_for_each_test
|
from ...utils import create_new_process_for_each_test
|
||||||
|
|
||||||
|
|
||||||
def models_list(*, all: bool = True, keywords: list[str] | None = None):
|
def models_list(*, all: bool = True, keywords: list[str] | None = None):
|
||||||
@ -95,7 +95,7 @@ def memory_plan_reuse_patched(self):
|
|||||||
# ===================================================
|
# ===================================================
|
||||||
# This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to
|
# This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to
|
||||||
# fix inductor partition + attention-nvfp4 quant fusion, tested in
|
# fix inductor partition + attention-nvfp4 quant fusion, tested in
|
||||||
# `tests/compile/test_fusions_e2e.py::test_attn_quant`.
|
# `tests/compile/distributed/test_fusions_e2e.py::test_attn_quant`.
|
||||||
# For more context, see https://github.com/pytorch/pytorch/pull/165815.
|
# For more context, see https://github.com/pytorch/pytorch/pull/165815.
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user