From 006e7a34aeb3e905ca4131a3251fe079f0511e2f Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Fri, 5 Sep 2025 05:08:50 -0700 Subject: [PATCH 01/23] Adding int4 and int8 models for CPU benchmarking (#23709) Signed-off-by: Tsai, Louie --- .../tests/serving-tests-cpu-snc2.json | 420 +++++++++++- .../tests/serving-tests-cpu-snc3.json | 627 +++++++++++++++++- docs/contributing/benchmarks.md | 32 +- 3 files changed, 1066 insertions(+), 13 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json index 2d88a0b30c4f8..f758097e098e4 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -1,6 +1,6 @@ [ { - "test_name": "serving_llama8B_tp1_sharegpt", + "test_name": "serving_llama8B_bf16_tp1_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -32,7 +32,7 @@ } }, { - "test_name": "serving_llama8B_tp2_sharegpt", + "test_name": "serving_llama8B_bf16_tp2_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -64,7 +64,7 @@ } }, { - "test_name": "serving_llama8B_tp4_sharegpt", + "test_name": "serving_llama8B_bf16_tp4_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -96,7 +96,7 @@ } }, { - "test_name": "serving_llama8B_tp1_random_128_128", + "test_name": "serving_llama8B_bf16_tp1_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -131,7 +131,7 @@ } }, { - "test_name": "serving_llama8B_tp2_random_128_128", + "test_name": "serving_llama8B_bf16_tp2_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -166,7 +166,7 
@@ } }, { - "test_name": "serving_llama8B_tp4_random_128_128", + "test_name": "serving_llama8B_bf16_tp4_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -198,5 +198,413 @@ "random-output-len": 128, "num_prompts": 1000 } + }, + { + "test_name": "serving_llama8B_int8_tp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": 
{ + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": 
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + 
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": 
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": 
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + 
"load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } } ] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json index 823abbaa99f86..ce396d6e54f27 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -1,6 +1,6 @@ [ { - "test_name": "serving_llama8B_pp1_sharegpt", + "test_name": "serving_llama8B_bf16_pp1_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -32,7 +32,39 @@ } }, { - "test_name": "serving_llama8B_pp3_sharegpt", + "test_name": "serving_llama8B_bf16_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_bf16_pp3_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -64,7 +96,7 @@ } }, 
{ - "test_name": "serving_llama8B_tp2pp3_sharegpt", + "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -97,7 +129,7 @@ } }, { - "test_name": "serving_llama8B_pp1_random_128_128", + "test_name": "serving_llama8B_bf16_pp1_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -132,7 +164,42 @@ } }, { - "test_name": "serving_llama8B_pp3_random_128_128", + "test_name": "serving_llama8B_bf16_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_bf16_pp3_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -167,7 +234,7 @@ } }, { - "test_name": "serving_llama8B_tp2pp3_random_128_128", + "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -201,5 +268,553 @@ "ignore-eos": "", 
"num_prompts": 1000 } + }, + { + "test_name": "serving_llama8B_int8_pp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_pp3_sharegpt", + "qps_list": 
["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_pp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 
128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 
32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp2pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_pp1_sharegpt", + "qps_list": ["inf"], + 
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 
128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_pp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 
200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_pp3_random_128_128", + "qps_list": 
["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + 
"num_prompts": 1000 + } } ] diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 2bbed778f3c6a..25c2d2955ff2f 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -11,9 +11,39 @@ vLLM contains two sets of benchmarks: The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM. +### Manually Trigger the benchmark + +Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with the vLLM benchmark suite. +For a CPU environment, please use the image with the "-cpu" postfix. + +Here is an example docker run command for CPU. + +```bash +docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu +``` + +Then, run the command below inside the docker instance. + +```bash +bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +``` + +When run, the benchmark script generates results under the **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json. + +#### Runtime environment variables + +- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. +- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). +- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). +- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file). +- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. +- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
+ +For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results). + The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). -More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). +More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). [](){ #nightly-benchmarks } From 7812bcf2783acef15b6088ba223f1c94fec42d0d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 5 Sep 2025 22:48:42 +0800 Subject: [PATCH 02/23] [docs] add shenzhen meetup (#24326) Signed-off-by: youkaichao --- README.md | 5 +++-- docs/community/meetups.md | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e13993efd3702..4e03df758c261 100644 --- a/README.md +++ b/README.md @@ -18,16 +18,17 @@ Easy, fast, and cheap LLM serving for everyone *Latest News* 🔥 +- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA). - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing). 
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). -- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). -- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
Previous News +- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). +- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). diff --git a/docs/community/meetups.md b/docs/community/meetups.md index 04919769e173f..a3004249b758b 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -2,6 +2,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. 
[[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA) - [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing) - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH) - [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). From 23a6c5280e93ca8796f12b18d0e1dba4f3d1331d Mon Sep 17 00:00:00 2001 From: Chauncey Date: Sat, 6 Sep 2025 01:26:00 +0800 Subject: [PATCH 03/23] [gpt-oss][Bugfix]Fix streamableparser for missing handling of certain token_ids (#24306) Signed-off-by: chaunceyjiang --- vllm/entrypoints/context.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index fb58cba3a40ff..e4f2e800f94a6 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -238,11 +238,11 @@ class StreamingHarmonyContext(HarmonyContext): # (finished=True), then the next token processed will mark the # beginning of a new message self.first_tok_of_message = output.finished - tok = output.outputs[0].token_ids[0] - self.parser.process(tok) + for tok in output.outputs[0].token_ids: + self.parser.process(tok) self._update_num_output_tokens(output.outputs[0].token_ids) # Check if the current token is part of reasoning content - self._update_num_reasoning_tokens([tok]) + self._update_num_reasoning_tokens(output.outputs[0].token_ids) self.last_tok = tok else: # Handle the case of tool output in direct message format From eedb2a2a102b47d527071b9acfb9edd541ef5de6 Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Sat, 6 Sep 2025 04:13:42 +0800 Subject: [PATCH 04/23] [Bugfix] 
Fix silu_mul+quant fusion test (#24341) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> --- tests/compile/test_silu_mul_quant_fusion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index e16d1725e6add..731ceeb905f64 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -118,7 +118,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, fusion_pass = ActivationQuantFusionPass(config) backend = TestBackend(NoOpEliminationPass(config), fusion_pass) - model = model_class(hidden_size, cuda_force_torch) + model = model_class(hidden_size=hidden_size, + cuda_force_torch=cuda_force_torch) # First dimension dynamic x = torch.rand(num_tokens, hidden_size * 2) From 9dfbeb41e510ad04e90b03ef2f437f476c5abe28 Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Fri, 5 Sep 2025 14:14:18 -0700 Subject: [PATCH 05/23] [RFC] allow cancelation after shutdown in blocking collective_rpc (#23390) Signed-off-by: Shiyan Deng --- vllm/v1/executor/multiproc_executor.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 12e79ff165f4e..84eb956b5c254 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -253,7 +253,8 @@ class MultiprocExecutor(Executor): if not non_block: result = result.result() elif not non_block: - result = get_response(w, dequeue_timeout) + result = get_response(w, dequeue_timeout, + self.shutdown_event) else: raise RuntimeError("non_block can only be used when" " max_concurrent_batches > 1") @@ -295,12 +296,8 @@ class MultiprocExecutor(Executor): """Properly shut down the executor and its workers""" if not getattr(self, 'shutting_down', False): self.shutting_down = True - self.shutdown_event.set() - - if 
self.io_thread_pool is not None: - self.io_thread_pool.shutdown(wait=False, cancel_futures=True) - self.io_thread_pool = None + # Make sure all the worker processes are terminated first. if workers := getattr(self, 'workers', None): for w in workers: # Close death_writer to signal child processes to exit @@ -310,6 +307,11 @@ class MultiprocExecutor(Executor): w.worker_response_mq = None self._ensure_worker_termination([w.proc for w in workers]) + self.shutdown_event.set() + if self.io_thread_pool is not None: + self.io_thread_pool.shutdown(wait=False, cancel_futures=True) + del self.io_thread_pool + self.rpc_broadcast_mq = None def check_health(self) -> None: From c954c6629cdf4ebbfc1cab05d7d71af6696c5881 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Fri, 5 Sep 2025 20:26:22 -0400 Subject: [PATCH 06/23] [CI] Add timeouts to tests (#24260) Signed-off-by: Rafael Vasquez Signed-off-by: Nick Hill Co-authored-by: Nick Hill --- .buildkite/test-pipeline.yaml | 146 ++++++++++++++++++++++++---------- 1 file changed, 102 insertions(+), 44 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 55349e0ac9321..ad240023a0030 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -41,7 +41,8 @@ steps: commands: - bash standalone_tests/pytorch_nightly_dependency.sh -- label: Async Engine, Inputs, Utils, Worker Test # 24min +- label: Async Engine, Inputs, Utils, Worker Test # 36min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -63,7 +64,8 @@ steps: - pytest -v -s utils_ # Utils - pytest -v -s worker # Worker -- label: Python-only Installation Test +- label: Python-only Installation Test # 10min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - tests/standalone_tests/python_only_compile.sh @@ -71,7 +73,8 @@ steps: commands: - bash standalone_tests/python_only_compile.sh -- label: Basic Correctness Test # 30min +- label: 
Basic Correctness Test # 20min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] fast_check: true torch_nightly: true @@ -88,7 +91,8 @@ steps: - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py -- label: Core Test # 10min +- label: Core Test # 22min + timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] fast_check: true source_file_dependencies: @@ -98,7 +102,8 @@ steps: commands: - pytest -v -s core -- label: Entrypoints Test (LLM) # 40min +- label: Entrypoints Test (LLM) # 30min + timeout_in_minutes: 40 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" fast_check: true @@ -114,7 +119,8 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Test (API Server) # 40min +- label: Entrypoints Test (API Server) # 100min + timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" fast_check: true @@ -129,7 +135,8 @@ steps: - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py - pytest -v -s entrypoints/test_chat_utils.py -- label: Distributed Tests (4 GPUs) # 10min +- label: Distributed Tests (4 GPUs) # 35min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -172,7 +179,8 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd -- label: EPLB Algorithm Test +- label: EPLB Algorithm Test # 5min + timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/eplb @@ -181,6 
+189,7 @@ steps: - pytest -v -s distributed/test_eplb_algo.py - label: EPLB Execution Test # 5min + timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -189,7 +198,8 @@ steps: commands: - pytest -v -s distributed/test_eplb_execute.py -- label: Metrics, Tracing Test # 10min +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] num_gpus: 2 source_file_dependencies: @@ -208,7 +218,8 @@ steps: ##### fast check tests ##### ##### 1 GPU test ##### -- label: Regression Test # 5min +- label: Regression Test # 7min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -218,7 +229,8 @@ steps: - pytest -v -s test_regression.py working_dir: "/vllm-workspace/tests" # optional -- label: Engine Test # 10min +- label: Engine Test # 25min + timeout_in_minutes: 40 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -233,7 +245,8 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization -- label: V1 Test e2e + engine +- label: V1 Test e2e + engine # 30min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -244,7 +257,8 @@ steps: - pytest -v -s v1/e2e - pytest -v -s v1/engine -- label: V1 Test entrypoints +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -252,7 +266,8 @@ steps: commands: - pytest -v -s v1/entrypoints -- label: V1 Test others +- label: V1 Test others # 42min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -276,7 +291,8 @@ steps: - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -- label: Examples Test # 25min +- label: Examples Test # 30min + timeout_in_minutes: 45 
mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/examples" source_file_dependencies: @@ -301,7 +317,8 @@ steps: - python3 offline_inference/basic/score.py - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 -- label: Platform Tests (CUDA) +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -309,7 +326,8 @@ steps: commands: - pytest -v -s cuda/test_cuda_context.py -- label: Samplers Test # 36min +- label: Samplers Test # 56min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor/layers @@ -320,15 +338,23 @@ steps: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers -- label: LoRA Test %N # 15min each +- label: LoRA Test %N # 20min each + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + --ignore=lora/test_llm_with_multi_loras.py parallelism: 4 -- label: PyTorch Compilation Unit Tests +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -344,7 +370,8 @@ steps: - pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_decorator.py -- label: PyTorch Fullgraph Smoke Test # 9min +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -358,7 
+385,8 @@ steps: - pytest -v -s compile/piecewise/test_full_cudagraph.py - pytest -v -s compile/piecewise/test_multiple_graphs.py -- label: PyTorch Fullgraph Test # 18min +- label: PyTorch Fullgraph Test # 20min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -367,7 +395,8 @@ steps: commands: - pytest -v -s compile/test_full_graph.py -- label: Kernels Core Operation Test +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -375,7 +404,8 @@ steps: commands: - pytest -v -s kernels/core -- label: Kernels Attention Test %N +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/attention/ @@ -386,7 +416,8 @@ steps: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels Quantization Test %N +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/quantization/ @@ -396,7 +427,8 @@ steps: - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels MoE Test %N +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ @@ -408,7 +440,8 @@ steps: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels Mamba Test +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/mamba/ @@ -416,7 +449,8 @@ steps: commands: - pytest -v -s kernels/mamba -- label: Tensorizer Test # 11min +- label: Tensorizer Test # 14min + timeout_in_minutes: 
25 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor/model_loader @@ -428,7 +462,8 @@ steps: - pytest -v -s tensorizer_loader - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py -- label: Model Executor Test +- label: Model Executor Test # 7min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor @@ -438,7 +473,8 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s model_executor -- label: Benchmarks # 9min +- label: Benchmarks # 11min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: @@ -446,7 +482,8 @@ steps: commands: - bash scripts/run-benchmarks.sh -- label: Benchmarks CLI Test # 10min +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -454,7 +491,8 @@ steps: commands: - pytest -v -s benchmarks/ -- label: Quantization Test +- label: Quantization Test # 70min + timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -467,6 +505,7 @@ steps: - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -474,7 +513,8 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 -- label: OpenAI API correctness +- label: OpenAI API correctness # 22min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -483,7 +523,8 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: Encoder Decoder tests # 5min +- label: Encoder Decoder tests # 12min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -491,7 +532,8 @@ 
steps: commands: - pytest -v -s encoder_decoder -- label: OpenAI-Compatible Tool Use # 20 min +- label: OpenAI-Compatible Tool Use # 23 min + timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] fast_check: false source_file_dependencies: @@ -504,7 +546,8 @@ steps: ##### models test ##### -- label: Basic Models Test # 24min +- label: Basic Models Test # 57min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -517,7 +560,8 @@ steps: - pytest -v -s models/test_vision.py - pytest -v -s models/test_initialization.py -- label: Language Models Test (Standard) +- label: Language Models Test (Standard) # 35min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -528,6 +572,7 @@ steps: - pytest -v -s models/language -m core_model - label: Language Models Test (Hybrid) # 35 min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -540,7 +585,8 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m hybrid_model -- label: Language Models Test (Extended Generation) # 1hr20min +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: @@ -552,6 +598,7 @@ steps: - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: @@ -560,7 +607,8 @@ steps: commands: - pytest -v -s models/language/pooling -m 'not core_model' -- label: Multi-Modal Processor Test +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 source_file_dependencies: - vllm/ - tests/models/multimodal @@ -568,7 +616,8 @@ steps: - pip 
install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/processing -- label: Multi-Modal Models Test (Standard) +- label: Multi-Modal Models Test (Standard) # 60min + timeout_in_minutes: 80 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -610,7 +659,8 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' -- label: Quantized Models Test +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor/layers/quantization @@ -640,7 +690,8 @@ steps: - python3 examples/offline_inference/audio_language.py --model-type whisper - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl -- label: Blackwell Test +- label: Blackwell Test # 38 min + timeout_in_minutes: 60 working_dir: "/vllm-workspace/" gpu: b200 # optional: true @@ -682,6 +733,7 @@ steps: ##### multi gpus test ##### - label: Distributed Comm Ops Test # 7min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -693,6 +745,7 @@ steps: - pytest -v -s distributed/test_shm_broadcast.py - label: 2 Node Tests (4 GPUs in total) # 16min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -716,7 +769,8 @@ steps: - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code -- label: Distributed Tests (2 GPUs) # 40min +- label: Distributed Tests (2 GPUs) # 110min + timeout_in_minutes: 150 mirror_hardwares: [amdexperimental] 
working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -757,6 +811,7 @@ steps: - pytest -v -s models/multimodal/generation/test_maverick.py - label: Plugin Tests (2 GPUs) # 40min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -783,6 +838,7 @@ steps: - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - label: Pipeline Parallelism Test # 45min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -796,7 +852,8 @@ steps: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py -- label: LoRA TP Test (Distributed) +- label: LoRA TP Test (Distributed) # 17 min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] num_gpus: 4 source_file_dependencies: @@ -814,6 +871,7 @@ steps: - label: Weight Loading Multiple GPU Test # 33min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 From cee182b2970b56d3f9d5aa0017ce165cbdb788f6 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Fri, 5 Sep 2025 21:20:17 -0400 Subject: [PATCH 07/23] [Perf][V1] Fully overlap model execution (#23569) Signed-off-by: Benjamin Chislett --- vllm/v1/executor/multiproc_executor.py | 50 +++++- vllm/v1/outputs.py | 15 ++ vllm/v1/worker/gpu_input_batch.py | 5 + vllm/v1/worker/gpu_model_runner.py | 203 ++++++++++++++++++++++--- vllm/v1/worker/gpu_worker.py | 10 +- 5 files changed, 252 insertions(+), 31 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 84eb956b5c254..ef6303495c245 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -3,6 +3,7 @@ import multiprocessing import os import pickle +import queue import signal import threading import time @@ -33,7 +34,8 @@ from vllm.utils import (decorate_logs, get_distributed_init_method, 
get_loopback_ip, get_mp_context, get_open_port, set_process_title) from vllm.v1.executor.abstract import Executor, FailureCallback -from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput +from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds, + ModelRunnerOutput) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -414,6 +416,16 @@ class WorkerProc: # Initializes a message queue for sending the model output self.worker_response_mq = MessageQueue(1, 1) + scheduler_config = vllm_config.scheduler_config + self.use_async_scheduling = scheduler_config.async_scheduling + if self.use_async_scheduling: + self.async_output_queue: queue.Queue = queue.Queue() + self.async_output_copy_thread = Thread( + target=self.async_output_busy_loop, + daemon=True, + name="WorkerAsyncOutputCopy") + self.async_output_copy_thread.start() + # Initialize device and loads weights self.worker.init_device() self.worker.load_model() @@ -595,6 +607,36 @@ class WorkerProc: SUCCESS = auto() FAILURE = auto() + def enqueue_output(self, output: Any): + """Prepares output from the worker and enqueues it to the + worker_response_mq. If the output is an Exception, it is + converted to a FAILURE response. + """ + if isinstance(output, AsyncModelRunnerOutput): + output = output.get_output() + + if isinstance(output, Exception): + result = (WorkerProc.ResponseStatus.FAILURE, str(output)) + else: + result = (WorkerProc.ResponseStatus.SUCCESS, output) + self.worker_response_mq.enqueue(result) + + def handle_output(self, output: Any): + """Handles output from the worker. If async scheduling is enabled, + it is passed to the async_output_busy_loop thread. Otherwise, it is + enqueued directly to the worker_response_mq. 
+ """ + if self.use_async_scheduling: + self.async_output_queue.put(output) + else: + self.enqueue_output(output) + + def async_output_busy_loop(self): + """Entrypoint for the thread which handles outputs asynchronously.""" + while True: + output = self.async_output_queue.get() + self.enqueue_output(output) + def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" while True: @@ -614,10 +656,8 @@ class WorkerProc: # exception might not be serializable, so we convert it to # string, only for logging purpose. if output_rank is None or self.rank == output_rank: - self.worker_response_mq.enqueue( - (WorkerProc.ResponseStatus.FAILURE, str(e))) + self.handle_output(e) continue if output_rank is None or self.rank == output_rank: - self.worker_response_mq.enqueue( - (WorkerProc.ResponseStatus.SUCCESS, output)) + self.handle_output(output) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index f8d6b24702f3c..1b2da8addb19e 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod from dataclasses import dataclass from typing import NamedTuple, Optional @@ -114,6 +115,20 @@ class ModelRunnerOutput: num_nans_in_logits: Optional[dict[str, int]] = None +# ModelRunnerOutput wrapper for async scheduling. +class AsyncModelRunnerOutput(ABC): + + @abstractmethod + def get_output(self) -> ModelRunnerOutput: + """Get the ModelRunnerOutput for this async output. + + This is a blocking call that waits until the results are ready, which + might involve copying device tensors to the host. + This method should only be called once per AsyncModelRunnerOutput. 
+ """ + pass + + @dataclass class DraftTokenIds: diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index ad70d9efaaaac..83fc821b84946 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -250,6 +250,11 @@ class InputBatch: self.pooling_params: dict[str, PoolingParams] = {} + # Cached reference to the GPU tensor of previously sampled tokens + self.prev_sampled_token_ids: Optional[torch.Tensor] = None + self.prev_sampled_token_ids_invalid_indices: Optional[set[int]] = None + self.prev_req_id_to_index: Optional[dict[str, int]] = None + @property def req_ids(self) -> list[str]: # None elements should only be present transiently diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 42baf020e9dc6..7859e966b04f2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -67,8 +67,8 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, MambaSpec, SlidingWindowSpec) -from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds, - LogprobsTensors, ModelRunnerOutput) +from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, + DraftTokenIds, LogprobsTensors, ModelRunnerOutput) from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata @@ -100,6 +100,53 @@ else: logger = init_logger(__name__) +# Wrapper for ModelRunnerOutput to support overlapped execution. 
+class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): + + def __init__( + self, + model_runner_output: ModelRunnerOutput, + sampled_token_ids: torch.Tensor, + invalid_req_indices: list[int], + async_output_copy_stream: torch.cuda.Stream, + ): + self._model_runner_output = model_runner_output + self._invalid_req_indices = invalid_req_indices + + # Event on the copy stream so we can synchronize the non-blocking copy. + self._async_copy_ready_event = torch.cuda.Event() + + # Keep a reference to the device tensor to avoid it being + # deallocated until we finish copying it to the host. + self._sampled_token_ids = sampled_token_ids + + # Initiate the copy on a separate stream, but do not synchronize it. + default_stream = torch.cuda.current_stream() + with torch.cuda.stream(async_output_copy_stream): + async_output_copy_stream.wait_stream(default_stream) + self._sampled_token_ids_cpu = self._sampled_token_ids.to( + 'cpu', non_blocking=True) + self._async_copy_ready_event.record() + + def get_output(self) -> ModelRunnerOutput: + """Copy the device tensors to the host and return a ModelRunnerOutput. + + This function blocks until the copy is finished. 
+ """ + self._async_copy_ready_event.synchronize() + + # Release the device tensor once the copy has completed + del self._sampled_token_ids + + valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist() + for i in self._invalid_req_indices: + valid_sampled_token_ids[i].clear() + + output = self._model_runner_output + output.sampled_token_ids = valid_sampled_token_ids + return output + + class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def __init__( @@ -230,6 +277,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): is_pooling_model=self.is_pooling_model, ) + self.use_async_scheduling = self.scheduler_config.async_scheduling + self.async_output_copy_stream = torch.cuda.Stream() if \ + self.use_async_scheduling else None + # TODO(woosuk): Provide an option to tune the max cudagraph batch size. # The convention is different. # self.cudagraph_batch_sizes sorts in ascending order. @@ -654,6 +705,73 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return cu_num_tokens, arange + def _prepare_input_ids(self, total_num_scheduled_tokens: int, + cu_num_tokens: np.ndarray) -> None: + """Prepare the input IDs for the current batch. + + Carefully handles the `prev_sampled_token_ids` which can be cached + from the previous engine iteration, in which case those tokens on the + GPU need to be copied into the corresponding slots into input_ids.""" + + if self.input_batch.prev_sampled_token_ids is None: + # Normal scheduling case + self.input_ids.copy_to_gpu(total_num_scheduled_tokens) + return + + # Async scheduling case, where some decode requests from the previous + # iteration won't have entries in input_ids_cpu and need to be copied + # on the GPU from prev_sampled_token_ids. 
+ prev_req_id_to_index = self.input_batch.prev_req_id_to_index + assert prev_req_id_to_index is not None + flattened_indices = [] + prev_common_req_indices = [] + indices_match = True + max_flattened_index = -1 + for req_id, cur_index in self.input_batch.req_id_to_index.items(): + if (prev_index := prev_req_id_to_index.get(req_id)) is not None: + prev_common_req_indices.append(prev_index) + # We need to compute the flattened input_ids index of the + # last token in each common request. + flattened_index = cu_num_tokens[cur_index].item() - 1 + flattened_indices.append(flattened_index) + indices_match &= (prev_index == flattened_index) + max_flattened_index = max(max_flattened_index, flattened_index) + num_commmon_tokens = len(flattened_indices) + if num_commmon_tokens < total_num_scheduled_tokens: + # If not all requests are decodes from the last iteration, + # We need to copy the input_ids_cpu to the GPU first. + self.input_ids.copy_to_gpu(total_num_scheduled_tokens) + if num_commmon_tokens == 0: + # No requests in common with the previous iteration + # So input_ids_cpu will have all the input ids. + return + if indices_match and max_flattened_index == (num_commmon_tokens - 1): + # Common-case optimization: the batch is unchanged + # and no reordering happened. + # The indices are both the same permutation of 0..N-1 so + # we can copy directly using a single slice. + self.input_ids.gpu[:num_commmon_tokens].copy_( + self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, + 0], + non_blocking=True) + return + # Upload the index tensors asynchronously + # so the scatter can be non-blocking. 
+ input_ids_index_tensor = torch.tensor(flattened_indices, + dtype=torch.int64, + pin_memory=self.pin_memory).to( + self.device, + non_blocking=True) + prev_common_req_indices_tensor = torch.tensor( + prev_common_req_indices, + dtype=torch.int64, + pin_memory=self.pin_memory).to(self.device, non_blocking=True) + self.input_ids.gpu.scatter_( + dim=0, + index=input_ids_index_tensor, + src=self.input_batch.prev_sampled_token_ids[ + prev_common_req_indices_tensor, 0]) + def _prepare_inputs( self, scheduler_output: "SchedulerOutput", @@ -740,7 +858,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): max_seq_len = self.seq_lens.np[:num_reqs].max().item() # Copy the tensors to the GPU. - self.input_ids.copy_to_gpu(total_num_scheduled_tokens) + self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens) + if self.uses_mrope: # Only relevant for models using M-RoPE (e.g, Qwen2-VL) self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_( @@ -1458,7 +1577,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self, scheduler_output: "SchedulerOutput", intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Union[ModelRunnerOutput, IntermediateTensors]: + ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]: self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: if not has_kv_transfer_group(): @@ -1673,6 +1792,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # so that we could clear the sampled tokens before returning. discard_sampled_tokens_req_indices.append(i) + # Copy some objects so they don't get modified after returning. + # This is important when using async scheduling. + req_ids_output_copy = self.input_batch.req_ids.copy() + req_id_to_index_output_copy = \ + self.input_batch.req_id_to_index.copy() + # NOTE: GPU -> CPU Sync happens here. # Move as many CPU operations as possible before this sync point. 
logprobs_tensors = sampler_output.logprobs_tensors @@ -1685,21 +1810,41 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): scheduler_output.num_scheduled_tokens, ) - # Get the valid generated tokens. + num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids - max_gen_len = sampled_token_ids.shape[-1] - if max_gen_len == 1: - # No spec decode tokens. - valid_sampled_token_ids = self._to_list(sampled_token_ids) + if not self.use_async_scheduling: + # Get the valid generated tokens. + max_gen_len = sampled_token_ids.shape[-1] + if max_gen_len == 1: + # No spec decode tokens. + valid_sampled_token_ids = self._to_list(sampled_token_ids) + else: + # Includes spec decode tokens. + valid_sampled_token_ids = self.rejection_sampler.parse_output( + sampled_token_ids, + self.input_batch.vocab_size, + ) + # Mask out the sampled tokens that should not be sampled. + for i in discard_sampled_tokens_req_indices: + valid_sampled_token_ids[i].clear() else: - # Includes spec decode tokens. - valid_sampled_token_ids = self.rejection_sampler.parse_output( - sampled_token_ids, - self.input_batch.vocab_size, - ) - # Mask out the sampled tokens that should not be sampled. - for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[i].clear() + valid_sampled_token_ids = [] + invalid_req_indices = list(discard_sampled_tokens_req_indices) + invalid_req_indices_set = set(invalid_req_indices) + assert sampled_token_ids.shape[-1] == 1 + + # Cache the sampled tokens on the GPU and avoid CPU sync. + # These will be copied into input_ids in the next step + # when preparing inputs. 
+ self.input_batch.prev_sampled_token_ids = \ + sampled_token_ids + self.input_batch.prev_sampled_token_ids_invalid_indices = \ + invalid_req_indices_set + self.input_batch.prev_req_id_to_index = { + req_id: i + for i, req_id in enumerate(self.input_batch.req_ids) + if i not in invalid_req_indices_set + } # Cache the sampled tokens in the model runner, so that the scheduler # doesn't need to send them back. @@ -1707,7 +1852,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # the sampled tokens back, because there's no direct communication # between the first-stage worker and the last-stage worker. req_ids = self.input_batch.req_ids - for req_idx, sampled_ids in enumerate(valid_sampled_token_ids): + for req_idx in range(num_sampled_tokens): + if self.use_async_scheduling: + sampled_ids = [-1] if \ + req_idx not in invalid_req_indices_set else None + else: + sampled_ids = valid_sampled_token_ids[req_idx] if not sampled_ids: continue @@ -1722,6 +1872,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): start_idx:end_idx] = sampled_ids self.input_batch.num_tokens_no_spec[req_idx] = end_idx self.input_batch.num_tokens[req_idx] = end_idx + req_id = req_ids[req_idx] req_state = self.requests[req_id] req_state.output_token_ids.extend(sampled_ids) @@ -1741,9 +1892,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.eplb_step() - return ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, + output = ModelRunnerOutput( + req_ids=req_ids_output_copy, + req_id_to_index=req_id_to_index_output_copy, sampled_token_ids=valid_sampled_token_ids, logprobs=logprobs_lists, prompt_logprobs_dict=prompt_logprobs_dict, @@ -1752,6 +1903,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_nans_in_logits=num_nans_in_logits, ) + if not self.use_async_scheduling: + return output + + return AsyncGPUModelRunnerOutput( + 
model_runner_output=output, + sampled_token_ids=sampled_token_ids, + invalid_req_indices=invalid_req_indices, + async_output_copy_stream=self.async_output_copy_stream, + ) + def take_draft_token_ids(self) -> Optional[DraftTokenIds]: if self._draft_token_ids is None: return None diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index affba877ecf92..99c805a3e9496 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -5,7 +5,7 @@ import copy import gc import os from contextlib import AbstractContextManager, nullcontext -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Optional, Union import torch import torch.distributed @@ -28,8 +28,8 @@ from vllm.tasks import SupportedTask from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec -from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds, - ModelRunnerOutput) +from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, + DraftTokenIds, ModelRunnerOutput) from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.worker_base import WorkerBase @@ -355,7 +355,7 @@ class Worker(WorkerBase): def execute_model( self, scheduler_output: "SchedulerOutput", - ) -> Optional[ModelRunnerOutput]: + ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]: intermediate_tensors = None forward_pass = scheduler_output.total_num_scheduled_tokens > 0 if forward_pass and not get_pp_group().is_first_rank: @@ -365,7 +365,7 @@ class Worker(WorkerBase): output = self.model_runner.execute_model(scheduler_output, intermediate_tensors) - if isinstance(output, ModelRunnerOutput): + if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)): return output assert isinstance(output, 
IntermediateTensors) From 35efa7029702f47a427bd78407647e2b05929bbb Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Fri, 5 Sep 2025 18:56:15 -0700 Subject: [PATCH 08/23] Add @22quinn as code reviewer for RL related components (#24346) --- .github/CODEOWNERS | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c087fd555c661..d2839deccbf2a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,13 +5,15 @@ /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn +/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/model_executor/layers/mamba @tdoublep +/vllm/model_executor/model_loader @22quinn /vllm/multimodal @DarkLight1337 @ywang96 +/vllm/v1/sample @22quinn @houseroad /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee /vllm/reasoning @aarnphm @@ -85,4 +87,3 @@ mkdocs.yaml @hmellor /vllm/v1/attention/backends/mla/rocm*.py @gshtras /vllm/attention/ops/rocm*.py @gshtras /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras - From 35bf19386489c204189fe920aa6e1b420fe75928 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Sat, 6 
Sep 2025 04:41:12 +0200 Subject: [PATCH 09/23] [Doc]: fix typos in Python comments (#24294) Signed-off-by: Didier Durand Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- csrc/quantization/machete/generate.py | 2 +- docs/getting_started/installation/cpu.md | 2 +- tests/models/multimodal/generation/vlm_utils/core.py | 2 +- vllm/distributed/device_communicators/custom_all_reduce.py | 4 ++-- .../openai/tool_parsers/internlm2_tool_parser.py | 6 +++--- vllm/envs.py | 2 +- vllm/model_executor/layers/fused_moe/fused_moe.py | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 4 ++-- vllm/model_executor/layers/quantization/gptq_marlin.py | 4 ++-- vllm/v1/attention/backends/flashinfer.py | 2 +- vllm/v1/engine/core.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 2 +- 12 files changed, 17 insertions(+), 17 deletions(-) diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 0d14ba15937c6..8fd536ef46e3d 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8): )) def prepacked_type_key(prepack_type: PrepackTypeConfig): - # For now we we can just use the first accumulator type seen since + # For now, we can just use the first accumulator type seen since # the tensor core shapes/layouts don't vary based on accumulator # type so we can generate less code this way return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert) diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index ccb2909ea3fb6..f8b4f75308df7 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. 
Larger batch - Offline Inference: `256 * world_size` - Online Serving: `128 * world_size` -vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes. +vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes. ### Which quantization configs does vLLM CPU support? diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index ae70838336957..11d44120b875f 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -42,7 +42,7 @@ def run_test( tensor_parallel_size: int = 1, vllm_embeddings: Optional[torch.Tensor] = None, ): - """Modality agnostic test test executor for comparing HF/vLLM outputs.""" + """Modality agnostic test executor for comparing HF/vLLM outputs.""" # In the case of embeddings, vLLM takes separate input tensors vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 80aca81234eb0..c8cc35f99785c 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -60,7 +60,7 @@ class CustomAllreduce: group: the process group to work on. If None, it will use the default process group. 
device: the device to bind the CustomAllreduce to. If None, - it will be bind to f"cuda:{local_rank}". + it will be bound to f"cuda:{local_rank}". It is the caller's responsibility to make sure each communicator is bind to a unique device, and all communicators in this group are in the same node. @@ -158,7 +158,7 @@ class CustomAllreduce: self.disabled = False # Buffers memory are owned by this Python class and passed to C++. - # Meta data composes of two parts: meta data for synchronization and a + # Metadata composes of two parts: metadata for synchronization and a # temporary buffer for storing intermediate allreduce results. self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size, group=group, diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 6ef8fadf59ac5..2055393d7ec71 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -35,7 +35,7 @@ class Internlm2ToolParser(ToolParser): self, request: ChatCompletionRequest) -> ChatCompletionRequest: if request.tools and request.tool_choice != 'none': # do not skip special tokens because internlm use the special - # tokens to indicated the start and end of the tool calls + # tokens to indicate the start and end of the tool calls # information. request.skip_special_tokens = False return request @@ -60,8 +60,8 @@ class Internlm2ToolParser(ToolParser): if '<|action_start|>' not in current_text: self.position = len(current_text) return DeltaMessage(content=delta_text) - # if the tool call is sended, return a empty delta message - # to make sure the finish_reason will be send correctly. + # if the tool call is sent, return an empty delta message + # to make sure the finish_reason will be sent correctly. 
if self.current_tool_id > 0: return DeltaMessage(content='') diff --git a/vllm/envs.py b/vllm/envs.py index 56adb83e8de15..e5b44893297e2 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1064,7 +1064,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # vllm should use flashinfer fused allreduce. The variable should be a # JSON with the following format: # { : } - # Unspecified world sizes will fallback to + # Unspecified world sizes will fall back to # { 2: 64, 4: 1, : 0.5 } "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": lambda: json.loads(os.getenv( diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index eb3e14180ecfe..06edfb0552e84 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -534,7 +534,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, EM = sorted_token_ids.size(0) if A.size(0) < config["BLOCK_SIZE_M"]: # optimize for small batch_size. - # We assume that top_ids of each token is unique, so + # We assume that top_ids of each token is unique, # so num_valid_experts <= batch_size <= BLOCK_SIZE_M, # and we can skip some invalid blocks. 
EM = min(sorted_token_ids.size(0), diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b1a61ade53649..272ad39565375 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -710,7 +710,7 @@ def determine_expert_map( # Create a tensor of size num_experts filled with -1 expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32) - # Create a expert map for the local experts + # Create an expert map for the local experts start_idx = ep_rank * base_experts + min(ep_rank, remainder) expert_map[start_idx:start_idx + local_num_experts] = torch.arange( 0, local_num_experts, dtype=torch.int32) @@ -806,7 +806,7 @@ class FusedMoE(CustomOp): self.global_num_experts = num_experts + num_redundant_experts - # we padding globally so EP buffer allocation works + # we are padding globally so EP buffer allocation works if quant_config and quant_config.get_name() == "mxfp4": from vllm.model_executor.layers.quantization.mxfp4 import ( # noqa: E501 should_use_flashinfer_mxfp4) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index cf959e13bc45c..76de3a59c8ca1 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -469,7 +469,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): ) layer.register_parameter("w2_scales", w2_scales) set_weight_attrs(w2_scales, extra_weight_attrs) - # dont shard the w2 scales when running act order + # don't shard the w2 scales when running act order set_weight_attrs(w2_scales, {"load_full_w2": self.quant_config.desc_act}) # up_proj scales @@ -493,7 +493,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): ) layer.register_parameter("w2_qzeros", w2_qzeros) set_weight_attrs(w2_qzeros, extra_weight_attrs) - # dont shard the w2 scales when running act order + # don't shard the w2 scales 
when running act order set_weight_attrs(w2_qzeros, {"load_full_w2": self.quant_config.desc_act}) w13_g_idx = torch.nn.Parameter( diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index fc1738579787a..06a853007a578 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -687,7 +687,7 @@ class FlashInferImpl(AttentionImpl): else: raise ValueError(f"Unsupported output dtype: {output.dtype}") - # TRTLLM attn kernel requires o scale to pass as a host scalar, + # TRTLLM attn kernel requires o scale to be passed as a host scalar, # store the o scale as a host scalar in warmup run with cuda graph # not enabled if layer._o_scale_float is None: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index d7e9cfa3660b1..e239e6cbba167 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -439,7 +439,7 @@ class EngineCore: """ # Note on thread safety: no race condition. # `mm_receiver_cache` is reset at the end of LLMEngine init, - # and will only accessed in the input processing thread afterwards. + # and will only be accessed in the input processing thread afterwards. if self.mm_receiver_cache is not None and request.mm_features: request.mm_features = ( self.mm_receiver_cache.get_and_update_features( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7859e966b04f2..5bee2dff98329 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2826,7 +2826,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Disable cudagraph capturing globally, so any unexpected cudagraph # capturing will be detected and raise an error after here. # Note: We don't put it into graph_capture context manager because - # we may doing lazy capturing in future that still allows capturing + # we may do lazy capturing in future that still allows capturing # after here. 
set_cudagraph_capturing_enabled(False) From 3c529fc9945964819dc17b9910ad6ccdbf231413 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Fri, 5 Sep 2025 20:22:40 -0700 Subject: [PATCH 10/23] [KV Sharing] Raise error if using eagle with fast prefill (#24350) Signed-off-by: Yong Hoon Shin --- vllm/config/__init__.py | 18 ++++++++++++++++++ vllm/config/cache.py | 7 ------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 941aff8919a92..8bdc22acf380e 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3665,6 +3665,24 @@ class VllmConfig: " Disabling `torch.compile`.") self.compilation_config.level = CompilationLevel.NO_COMPILATION + if self.cache_config.kv_sharing_fast_prefill: + if not envs.VLLM_USE_V1: + raise NotImplementedError( + "Fast prefill optimization for KV sharing is not supported " + "in V0 currently.") + + if self.speculative_config is not None and \ + self.speculative_config.use_eagle(): + raise NotImplementedError( + "Fast prefill optimization for KV sharing is not " + "compatible with EAGLE as EAGLE requires correct logits " + "for all tokens while fast prefill gives incorrect logits " + "for prompt tokens.") + + logger.warning_once( + "--kv-sharing-fast-prefill requires changes on model side for " + "correctness and to realize prefill savings. 
") + if ((not envs.VLLM_USE_V1) and self.lora_config is not None and self.compilation_config.level != CompilationLevel.NO_COMPILATION): diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 79761e7844859..6f8f962fe7cad 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -145,19 +145,12 @@ class CacheConfig: self._verify_cache_dtype() self._verify_prefix_caching() - self._verify_kv_sharing_fast_prefill() def metrics_info(self): # convert cache_config to dict(key: str, value: str) for prometheus # metrics info return {key: str(value) for key, value in self.__dict__.items()} - def _verify_kv_sharing_fast_prefill(self) -> None: - if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1: - raise NotImplementedError( - "Fast prefill optimization for KV sharing is not supported " - "in V0 currently.") - @model_validator(mode='after') def _verify_args(self) -> Self: if self.cpu_offload_gb < 0: From ac201a0eaf2af032779db3e7ac96f857cbaa5b7a Mon Sep 17 00:00:00 2001 From: yzds <41983536+youzhedian@users.noreply.github.com> Date: Sat, 6 Sep 2025 13:24:05 +0800 Subject: [PATCH 11/23] [Feature] Support Decode Context Parallel (DCP) for MLA (#23734) Signed-off-by: hongchao Signed-off-by: youkaichao Co-authored-by: hongchao Co-authored-by: youkaichao --- .buildkite/test-pipeline.yaml | 3 +- csrc/cache.h | 7 - csrc/cache_kernels.cu | 103 ------ csrc/torch_bindings.cpp | 10 - tests/distributed/test_context_parallel.py | 263 ++++++++++++++ vllm/_custom_ops.py | 14 - vllm/attention/ops/common.py | 139 ++++++++ vllm/attention/ops/flashmla.py | 4 +- vllm/config/parallel.py | 5 + vllm/distributed/parallel_state.py | 49 ++- vllm/engine/arg_utils.py | 17 + vllm/v1/attention/backends/mla/common.py | 332 ++++++++++++++++-- vllm/v1/attention/backends/mla/cutlass_mla.py | 18 +- .../attention/backends/mla/flashattn_mla.py | 13 +- vllm/v1/attention/backends/mla/flashmla.py | 18 +- .../attention/backends/mla/rocm_aiter_mla.py | 15 +- 
vllm/v1/attention/backends/mla/triton_mla.py | 15 +- vllm/v1/core/kv_cache_coordinator.py | 70 ++-- vllm/v1/core/kv_cache_manager.py | 9 + vllm/v1/core/kv_cache_utils.py | 6 + vllm/v1/core/sched/scheduler.py | 10 + vllm/v1/core/single_type_kv_cache_manager.py | 19 +- vllm/v1/kv_cache_interface.py | 8 + vllm/v1/worker/block_table.py | 59 +++- vllm/v1/worker/gpu_model_runner.py | 11 + vllm/v1/worker/gpu_worker.py | 6 +- vllm/worker/worker.py | 6 +- 27 files changed, 999 insertions(+), 230 deletions(-) create mode 100644 tests/distributed/test_context_parallel.py create mode 100644 vllm/attention/ops/common.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ad240023a0030..b0d4c4456d339 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -837,7 +837,7 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins -- label: Pipeline Parallelism Test # 45min +- label: Pipeline + Context Parallelism Test # 45min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" @@ -851,6 +851,7 @@ steps: commands: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py + # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support - label: LoRA TP Test (Distributed) # 17 min timeout_in_minutes: 30 diff --git a/csrc/cache.h b/csrc/cache.h index e8e069aefd9c5..fd230bec27fca 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -36,13 +36,6 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe, const std::string& kv_cache_dtype, torch::Tensor& scale); -void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe, - torch::Tensor& cp_local_token_select_indices, - torch::Tensor& kv_cache, - torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, - 
torch::Tensor& scale); - // Just for unittest void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, const double scale, const std::string& kv_cache_dtype); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index fbb022464ef27..80b4c47c55476 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -396,51 +396,6 @@ __global__ void concat_and_cache_mla_kernel( copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); } -template -__global__ void cp_fused_concat_and_cache_mla_kernel( - const scalar_t* __restrict__ kv_c, // [num_full_tokens, kv_lora_rank] - const scalar_t* __restrict__ k_pe, // [num_full_tokens, pe_dim] - const int64_t* __restrict__ cp_local_token_select_indices, // [num_tokens] - cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank - // + pe_dim)] - const int64_t* __restrict__ slot_mapping, // [num_tokens] - const int block_stride, // - const int entry_stride, // - const int kv_c_stride, // - const int k_pe_stride, // - const int kv_lora_rank, // - const int pe_dim, // - const int block_size, // - const float* scale // -) { - const int64_t token_idx = cp_local_token_select_indices[blockIdx.x]; - const int64_t slot_idx = slot_mapping[blockIdx.x]; - // NOTE: slot_idx can be -1 if the token is padded - if (slot_idx < 0) { - return; - } - const int64_t block_idx = slot_idx / block_size; - const int64_t block_offset = slot_idx % block_size; - - auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst, - int src_stride, int dst_stride, int size, int offset) { - for (int i = threadIdx.x; i < size; i += blockDim.x) { - const int64_t src_idx = token_idx * src_stride + i; - const int64_t dst_idx = - block_idx * block_stride + block_offset * entry_stride + i + offset; - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - dst[dst_idx] = src[src_idx]; - } else { - dst[dst_idx] = - fp8::scaled_convert(src[src_idx], *scale); - } - } - }; - - copy(kv_c, kv_cache, kv_c_stride, 
block_stride, kv_lora_rank, 0); - copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); -} - } // namespace vllm // KV_T is the data type of key and value tensors. @@ -554,20 +509,6 @@ void reshape_and_cache_flash( kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ reinterpret_cast(scale.data_ptr())); -// KV_T is the data type of key and value tensors. -// CACHE_T is the stored data type of kv-cache. -// KV_DTYPE is the real data type of kv-cache. -#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ - vllm::cp_fused_concat_and_cache_mla_kernel \ - <<>>( \ - reinterpret_cast(kv_c.data_ptr()), \ - reinterpret_cast(k_pe.data_ptr()), \ - cp_local_token_select_indices.data_ptr(), \ - reinterpret_cast(kv_cache.data_ptr()), \ - slot_mapping.data_ptr(), block_stride, entry_stride, \ - kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ - reinterpret_cast(scale.data_ptr())); - void concat_and_cache_mla( torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] torch::Tensor& k_pe, // [num_tokens, pe_dim] @@ -606,50 +547,6 @@ void concat_and_cache_mla( CALL_CONCAT_AND_CACHE_MLA); } -// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel -// calls into one: -// k_c_normed.index_select(0, cp_local_token_select_indices) + \ -// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \ -// concat_and_cache_mla. -void cp_fused_concat_and_cache_mla( - torch::Tensor& kv_c, // [num_total_tokens, kv_lora_rank] - torch::Tensor& k_pe, // [num_total_tokens, pe_dim] - torch::Tensor& cp_local_token_select_indices, // [num_tokens] - torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank + - // pe_dim)] - torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] - const std::string& kv_cache_dtype, torch::Tensor& scale) { - // NOTE(woosuk): In vLLM V1, key.size(0) can be different from - // slot_mapping.size(0) because of padding for CUDA graphs. 
- // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because - // both include padding. - // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0) - // since key includes padding for CUDA graphs, while slot_mapping does not. - // In this case, slot_mapping.size(0) represents the actual number of tokens - // before padding. - // For compatibility with both cases, we use slot_mapping.size(0) as the - // number of tokens. - int num_tokens = slot_mapping.size(0); - int kv_lora_rank = kv_c.size(1); - int pe_dim = k_pe.size(1); - int block_size = kv_cache.size(1); - - TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); - - int kv_c_stride = kv_c.stride(0); - int k_pe_stride = k_pe.stride(0); - int block_stride = kv_cache.stride(0); - int entry_stride = kv_cache.stride(1); - - dim3 grid(num_tokens); - dim3 block(std::min(kv_lora_rank, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, - CALL_CP_FUSED_CONCAT_AND_CACHE_MLA); -} - namespace vllm { template diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index b769c09adc0f0..95fb5b197f534 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -693,16 +693,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor scale) -> ()"); cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla); - cache_ops.def( - "cp_fused_concat_and_cache_mla(Tensor kv_c, Tensor k_pe," - " Tensor cp_local_token_select_indices," - " Tensor! kv_cache," - " Tensor slot_mapping," - " str kv_cache_dtype," - " Tensor scale) -> ()"); - cache_ops.impl("cp_fused_concat_and_cache_mla", torch::kCUDA, - &cp_fused_concat_and_cache_mla); - // Convert the key and value cache to fp8 data type. cache_ops.def( "convert_fp8(Tensor! 
dst_cache, Tensor src_cache, float scale, " diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py new file mode 100644 index 0000000000000..23be703a30682 --- /dev/null +++ b/tests/distributed/test_context_parallel.py @@ -0,0 +1,263 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +WARNING: This test runs in both single-node (4 GPUs) and multi-node + (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is + important to set the distributed backend to "mp" to avoid Ray scheduling + all workers in a node other than the head node, which can cause the test + to fail. +""" +import json +import os +from dataclasses import dataclass +from typing import Literal, NamedTuple, Optional + +import pytest + +from vllm.config import RunnerOption +from vllm.logger import init_logger + +from ..models.registry import HF_EXAMPLE_MODELS +from ..utils import compare_two_settings, create_new_process_for_each_test + +logger = init_logger("test_context_parallel") + +VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" + + +class ParallelSetup(NamedTuple): + tp_size: int + pp_size: int + dcp_size: int + eager_mode: bool + chunked_prefill: bool + + +class CPTestOptions(NamedTuple): + multi_node_only: bool + load_format: Optional[str] = None + + +@dataclass +class CPTestSettings: + parallel_setups: list[ParallelSetup] + # NOTE: the length of distributed_backends and + # vllm_major_versions should be the same, and they + # are first zipped together to iterate over all + # test settings. 
+ distributed_backends: list[str] + # vllm major version: "0" for V0, "1" for V1 + vllm_major_versions: list[str] + runner: RunnerOption + test_options: CPTestOptions + + def __post_init__(self): + if len(self.distributed_backends) != len(self.vllm_major_versions): + raise ValueError( + f"Length mismatch: distributed_backends " + f"({len(self.distributed_backends)}) != " + f"vllm_major_versions ({len(self.vllm_major_versions)})") + + @staticmethod + def detailed( + *, + tp_base: int = 4, + pp_base: int = 1, + dcp_base: int = 1, + multi_node_only: bool = False, + runner: RunnerOption = "auto", + load_format: Optional[str] = None, + ): + parallel_setups = [] + for eager_mode_val in [False]: + for pp_multiplier in [1]: + for dcp_multiplier in [2, 4]: + for chunked_prefill_val in [True]: + parallel_setups.append( + ParallelSetup(tp_size=tp_base, + pp_size=pp_multiplier * pp_base, + dcp_size=dcp_multiplier * dcp_base, + eager_mode=eager_mode_val, + chunked_prefill=chunked_prefill_val)) + return CPTestSettings( + parallel_setups=parallel_setups, + distributed_backends=["mp"], + vllm_major_versions=["1"], + runner=runner, + test_options=CPTestOptions(multi_node_only=multi_node_only, + load_format=load_format), + ) + + def iter_params(self, model_id: str): + opts = self.test_options + + for parallel_setup in self.parallel_setups: + for backend, vllm_major_version in zip(self.distributed_backends, + self.vllm_major_versions): + yield (model_id, parallel_setup, backend, vllm_major_version, + self.runner, opts) + + +def _compare_cp_with_tp( + model_id: str, + parallel_setup: ParallelSetup, + distributed_backend: str, + vllm_major_version: str, + runner: RunnerOption, + test_options: CPTestOptions, + num_gpus_available: int, + *, + method: Literal["generate"], + is_multimodal: bool, +): + ( + tp_size, + pp_size, + dcp_size, + eager_mode, + chunked_prefill, + ) = parallel_setup + + multi_node_only, load_format = test_options + + model_info = 
HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_transformers_version(on_fail="skip") + + trust_remote_code = model_info.trust_remote_code + tokenizer_mode = model_info.tokenizer_mode + hf_overrides = model_info.hf_overrides + + if load_format == "dummy": + # Avoid OOM + text_overrides = { + "num_hidden_layers": 4, + "hidden_size": 512, + "intermediate_size": 800, + "num_attention_heads": 4, + "num_key_value_heads": 1, + } + + if is_multimodal: + hf_overrides.update({"text_config": text_overrides}) + else: + hf_overrides.update(text_overrides) + else: + model_info.check_available_online(on_fail="skip") + + if num_gpus_available < tp_size * pp_size: + pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") + if VLLM_MULTI_NODE and distributed_backend == "mp": + pytest.skip("Skipping multi-node pipeline parallel test for " + "multiprocessing distributed backend") + if multi_node_only and not VLLM_MULTI_NODE: + pytest.skip("Not in multi-node setting") + + common_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "8", + ] + if chunked_prefill: + common_args.append("--enable-chunked-prefill") + if eager_mode: + common_args.append("--enforce-eager") + if runner != "auto": + common_args.extend(["--runner", runner]) + if trust_remote_code: + common_args.append("--trust-remote-code") + if tokenizer_mode: + common_args.extend(["--tokenizer-mode", tokenizer_mode]) + if load_format: + common_args.extend(["--load-format", load_format]) + if hf_overrides: + common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) + + cp_env = tp_env = { + "VLLM_USE_V1": + vllm_major_version, # Note(hc): DCP only support V1 engine only + } + + cp_args = [ + *common_args, + "--tensor-parallel-size", + str(tp_size), + "--pipeline-parallel-size", + str(pp_size), + "--decode-context-parallel-size", + str(dcp_size), + "--distributed-executor-backend", + distributed_backend, + 
] + + tp_args = [ + *common_args, + "--tensor-parallel-size", + str(tp_size), + "--pipeline-parallel-size", + str(pp_size), + "--distributed-executor-backend", + distributed_backend, + ] + + try: + compare_two_settings(model_id, + cp_args, + tp_args, + cp_env, + tp_env, + method=method, + max_wait_seconds=720) + except Exception: + testing_ray_compiled_graph = cp_env is not None + if testing_ray_compiled_graph and vllm_major_version == "0": + # Ray Compiled Graph tests are flaky for V0, + # so we don't want to fail the test + logger.exception("Ray Compiled Graph tests failed") + else: + raise + + +CP_TEXT_GENERATION_MODELS = { + # [MLA attention only] + "deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(), +} + +CP_TEST_MODELS = [ + # TODO support other models + # [LANGUAGE GENERATION] + "deepseek-ai/DeepSeek-V2-Lite-Chat", +] + + +@pytest.mark.parametrize( + ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", + "runner", "test_options"), + [ + params for model_id, settings in CP_TEXT_GENERATION_MODELS.items() + for params in settings.iter_params(model_id) + if model_id in CP_TEST_MODELS + ], +) +@create_new_process_for_each_test() +def test_cp_generation( + model_id: str, + parallel_setup: ParallelSetup, + distributed_backend: str, + vllm_major_version: str, + runner: RunnerOption, + test_options: CPTestOptions, + num_gpus_available, +): + _compare_cp_with_tp(model_id, + parallel_setup, + distributed_backend, + vllm_major_version, + runner, + test_options, + num_gpus_available, + method="generate", + is_multimodal=False) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index bb67d5790aaaa..545f4cb48bf47 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1625,20 +1625,6 @@ def concat_and_cache_mla( scale) -def cp_fused_concat_and_cache_mla( - kv_c: torch.Tensor, - k_pe: torch.Tensor, - cp_local_token_select_indices: torch.Tensor, - kv_cache: torch.Tensor, - slot_mapping: torch.Tensor, - kv_cache_dtype: str, - 
scale: torch.Tensor, -) -> None: - torch.ops._C_cache_ops.cp_fused_concat_and_cache_mla( - kv_c, k_pe, cp_local_token_select_indices, kv_cache, slot_mapping, - kv_cache_dtype, scale) - - def copy_blocks(key_caches: list[torch.Tensor], value_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py new file mode 100644 index 0000000000000..189b57e8e8b82 --- /dev/null +++ b/vllm/attention/ops/common.py @@ -0,0 +1,139 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + +from vllm.distributed.parallel_state import GroupCoordinator +from vllm.triton_utils import tl, triton + + +@triton.jit +def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr, + vlse_ptr, outputs_stride_B, outputs_stride_H, + outputs_stride_D, lses_stride_N, lses_stride_B, + lses_stride_H, lse_idx, HEAD_DIM: tl.constexpr, + N_ROUNDED: tl.constexpr): + """ + Apply the all-gathered lses to correct each local rank's attention + output. we still need perform a cross-rank reduction to obtain the + final attention output. 
+ + Args: + output: [ B, H, D ] + lses : [ N, B, H ] + cp, batch, q_heads, v_head_dim + Return: + output: [ B, H, D ] + lse : [ B, H ] + """ + batch_idx = tl.program_id(axis=0).to(tl.int64) + head_idx = tl.program_id(axis=1).to(tl.int64) + d_offsets = tl.arange(0, HEAD_DIM) + num_n_offsets = tl.arange(0, N_ROUNDED) + + # shape = [N] + lse_offsets = num_n_offsets * lses_stride_N + batch_idx * \ + lses_stride_B + head_idx * lses_stride_H + + # calc final lse + lse = tl.load(lses_ptr + lse_offsets) + lse = tl.where((lse != lse) | (lse == float('inf')), -float('inf'), lse) + lse_max = tl.max(lse, axis=0) + lse -= lse_max + lse_exp = tl.exp(lse) + lse_acc = tl.sum(lse_exp, axis=0) + lse = tl.log(lse_acc) + lse += lse_max + + lse_offsets = batch_idx * lses_stride_B + head_idx * lses_stride_H + tl.store(vlse_ptr + lse_offsets, lse) + + # shape = [D] + output_offsets = batch_idx * outputs_stride_B + \ + head_idx * outputs_stride_H + \ + d_offsets * outputs_stride_D + + # correct output + lse_offset = lse_idx * lses_stride_N + batch_idx * \ + lses_stride_B + head_idx * lses_stride_H + lse_tmp = tl.load(lses_ptr + lse_offset) + lse_finally = lse_tmp - lse + lse_finally = tl.where( + (lse_finally != lse_finally) | (lse_finally == float('inf')), + -float('inf'), lse_finally) + factor = tl.exp(lse_finally) + output = tl.load(outputs_ptr + output_offsets) + output = output * factor + + tl.store(new_output_ptr + output_offsets, output) + + +class CPTritonContext: + """ The CPTritonContext is used to avoid recompilation of the Triton JIT. 
+ """ + + def __init__(self): + self.inner_kernel = None + + def call_kernel(self, kernel, grid, *regular_args, **const_args): + if self.inner_kernel is None: + self.inner_kernel = kernel[grid](*regular_args, **const_args) + else: + self.inner_kernel[grid](*regular_args) + + +def correct_attn_out(out: torch.Tensor, lses: torch.Tensor, cp_rank: int, + ctx: CPTritonContext): + """ + Apply the all-gathered lses to correct each local rank's attention + output. we still need perform a cross-rank reduction to obtain the + final attention output. + + Args: + output: [ B, H, D ] + lses : [ N, B, H ] + Return: + output: [ B, H, D ] + lse : [ B, H ] + """ + if ctx is None: + ctx = CPTritonContext() + + lse = torch.empty_like(lses[0]) + + grid = (out.shape[0], out.shape[1], 1) + regular_args = (out, out, lses, lse, *out.stride(), *lses.stride(), + cp_rank) + const_args = { + "HEAD_DIM": out.shape[-1], + "N_ROUNDED": lses.shape[0], + } + + ctx.call_kernel(_correct_attn_cp_out_kernel, grid, *regular_args, + **const_args) + return out, lse + + +def cp_lse_ag_out_rs(cp_attn_out: torch.Tensor, + cp_attn_lse: torch.Tensor, + cp_group: GroupCoordinator, + ctx: CPTritonContext = None): + """ + cp_attn_out: [ B, H, D ] + cp_attn_lse: [ B, H ] + """ + if cp_group.world_size == 1: + return cp_attn_out + + if ctx is None: + ctx = CPTritonContext() + + lses = torch.empty((cp_group.world_size, ) + cp_attn_lse.shape, + dtype=cp_attn_lse.dtype, + device=cp_attn_lse.device) + + cp_attn_lse = cp_attn_lse.contiguous() + lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) + out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) + assert out.is_contiguous() + out = cp_group.reduce_scatter(out, dim=1) + return out diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py index 564042cf8eb12..2c3e8c42400ce 100644 --- a/vllm/attention/ops/flashmla.py +++ b/vllm/attention/ops/flashmla.py @@ -105,7 +105,9 @@ def flash_mla_with_kvcache( descale_q, 
descale_k, ) - return out, softmax_lse + + # Note(hc): need revisit when we support DCP with decode query_len > 1. + return out.squeeze(1), softmax_lse.squeeze(-1) # diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 9d4594bab3c17..fb8e30996ea33 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -170,6 +170,11 @@ class ParallelConfig: Set to be private as it's not intended to be configured by users. """ + decode_context_parallel_size: int = 1 + """Number of decode context parallel groups, because the world size does + not change by dcp, it simply reuse the GPUs of TP group, and tp_size + needs to be divisible by dcp_size.""" + @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index fc96c2ac926b0..522dfc8d8b5a0 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -904,6 +904,18 @@ def get_tensor_model_parallel_group(): return get_tp_group() +_DCP: Optional[GroupCoordinator] = None + + +def get_dcp_group() -> GroupCoordinator: + assert _DCP is not None, ( + "decode context model parallel group is not initialized") + return _DCP + + +# kept for backward compatibility +get_context_model_parallel_group = get_dcp_group + _PP: Optional[GroupCoordinator] = None _DP: Optional[GroupCoordinator] = None @@ -1034,6 +1046,7 @@ def init_distributed_environment( def initialize_model_parallel( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, + decode_context_model_parallel_size: Optional[int] = 1, backend: Optional[str] = None, ) -> None: """ @@ -1098,6 +1111,23 @@ def initialize_model_parallel( use_message_queue_broadcaster=True, group_name="tp") + # Build the DCP model-parallel groups. 
+ global _DCP + assert _DCP is None, ( + "decode context model parallel group is already initialized") + # Note(hc): In the current implementation of decode context parallel, + # dcp_size must not exceed tp_size, because the world size does not + # change by DCP, it simply reuse the GPUs of TP group, and split one + # TP group into tp_size//dcp_size DCP groups. + group_ranks = all_ranks.reshape( + -1, decode_context_model_parallel_size).unbind(0) + group_ranks = [x.tolist() for x in group_ranks] + _DCP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, + backend, + use_message_queue_broadcaster=True, + group_name="dcp") + # Build the pipeline model-parallel groups. global _PP assert _PP is None, ( @@ -1141,6 +1171,7 @@ def initialize_model_parallel( def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, + decode_context_model_parallel_size: Optional[int] = 1, backend: Optional[str] = None, ) -> None: """Helper to initialize model parallel groups if they are not initialized, @@ -1151,7 +1182,8 @@ def ensure_model_parallel_initialized( get_world_group().device_group) if not model_parallel_is_initialized(): initialize_model_parallel(tensor_model_parallel_size, - pipeline_model_parallel_size, backend) + pipeline_model_parallel_size, + decode_context_model_parallel_size, backend) return assert ( @@ -1226,6 +1258,16 @@ def get_tensor_model_parallel_rank(): return get_tp_group().rank_in_group +def get_decode_context_model_parallel_world_size(): + """Return world size for the decode context model parallel group.""" + return get_dcp_group().world_size + + +def get_decode_context_model_parallel_rank(): + """Return my rank for the decode context model parallel group.""" + return get_dcp_group().rank_in_group + + def get_node_count() -> int: """Return the total number of nodes in the distributed environment. 
""" assert _NODE_COUNT is not None, ( @@ -1246,6 +1288,11 @@ def destroy_model_parallel(): _PP.destroy() _PP = None + global _DCP + if _DCP: + _DCP.destroy() + _DCP = None + global _DP if _DP: _DP.destroy() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 71ee90040f374..d96654ecfa466 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -306,6 +306,8 @@ class EngineArgs: # number of P/D disaggregation (or other disaggregation) workers pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size tensor_parallel_size: int = ParallelConfig.tensor_parallel_size + decode_context_parallel_size: int = \ + ParallelConfig.decode_context_parallel_size data_parallel_size: int = ParallelConfig.data_parallel_size data_parallel_rank: Optional[int] = None data_parallel_start_rank: Optional[int] = None @@ -636,6 +638,9 @@ class EngineArgs: **parallel_kwargs["pipeline_parallel_size"]) parallel_group.add_argument("--tensor-parallel-size", "-tp", **parallel_kwargs["tensor_parallel_size"]) + parallel_group.add_argument( + "--decode-context-parallel-size", "-dcp", + **parallel_kwargs["decode_context_parallel_size"]) parallel_group.add_argument("--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"]) parallel_group.add_argument( @@ -1156,6 +1161,17 @@ class EngineArgs: # global layers in interleaved sliding window models. sliding_window = model_config.get_sliding_window() + # Note(hc): In the current implementation of decode context + # parallel(DCP), tp_size needs to be divisible by dcp_size, + # because the world size does not change by dcp, it simply + # reuse the GPUs of TP group, and split one TP group into + # tp_size//dcp_size DCP groups. + assert self.tensor_parallel_size % self.decode_context_parallel_size \ + == 0, ( + f"tp_size={self.tensor_parallel_size} must be divisible by" + f"dcp_size={self.decode_context_parallel_size}." 
+ ) + cache_config = CacheConfig( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, @@ -1306,6 +1322,7 @@ class EngineArgs: distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, + decode_context_parallel_size=self.decode_context_parallel_size, ) speculative_config = self.create_speculative_config( diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 9696b6c0913c4..090ebf93840d8 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -201,10 +201,11 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer, AttentionMetadata, MLAAttentionImpl) from vllm.attention.backends.utils import get_mla_dims +from vllm.attention.ops.common import cp_lse_ag_out_rs from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.config import VllmConfig -from vllm.distributed.parallel_state import is_global_first_rank +from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearBase, @@ -323,6 +324,13 @@ class MLACommonPrefillMetadata: seq_lens: torch.Tensor workspace: torch.Tensor + # for mla DCP + cp_chunk_seq_lens: Optional[list[list[int]]] = None + origin_context_lens: Optional[list[int]] = None + cp_cu_seq_lens: Optional[torch.Tensor] = None + chunk_size: Optional[int] = None + cu_seq_lens_lst: Optional[list[list[int]]] = None + block_table: torch.Tensor query_start_loc: torch.Tensor max_query_len: int @@ -444,6 +452,13 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): parallel_config) self.mla_dims = get_mla_dims(self.model_config) self.aot_schedule = current_platform.is_cuda() + try: + self.dcp_world_size = 
get_dcp_group().world_size + self.dcp_rank = get_dcp_group().rank_in_group + except AssertionError: + # DCP might not be initialized in testing + self.dcp_world_size = 1 + self.dcp_rank = 0 # Dont try to access the runner on AMD if self.aot_schedule: @@ -465,12 +480,27 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): 128 * 1024) assert self.chunked_prefill_workspace_size >= \ scheduler_config.max_num_seqs * cache_config.block_size - self.chunked_prefill_workspace = torch.empty( - (self.chunked_prefill_workspace_size, - self.model_config.get_head_size()), - dtype=self.model_config.dtype, - device=device, - ) + if self.dcp_world_size > 1: + # Note(hc): The local kvcache is incomplete when DCP is triggered, + # an additional kvcache allgather across the DCP group is therefore + # required, so the workspace has to be enlarged by 1/DCP relative + # to the original TP allocation. + assert self.chunked_prefill_workspace_size % \ + self.dcp_world_size == 0 + self.chunked_prefill_workspace = torch.empty( + (self.chunked_prefill_workspace_size + + self.chunked_prefill_workspace_size // self.dcp_world_size, + self.model_config.get_head_size()), + dtype=self.model_config.dtype, + device=device, + ) + else: + self.chunked_prefill_workspace = torch.empty( + (self.chunked_prefill_workspace_size, + self.model_config.get_head_size()), + dtype=self.model_config.dtype, + device=device, + ) self._use_cudnn_prefill = use_cudnn_prefill() self._use_fi_prefill = use_flashinfer_prefill() @@ -631,6 +661,12 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): split_decodes_and_prefills(common_attn_metadata, decode_threshold=self.reorder_batch_threshold) + # Note(hc): update seq_lens of decode reqs under DCP. 
+ if self.dcp_world_size > 1: + seq_lens[:num_decodes] = seq_lens[:num_decodes] \ + // self.dcp_world_size + (self.dcp_rank <= \ + (seq_lens[:num_decodes] - 1) % self.dcp_world_size) + assert num_decodes + num_prefills == num_reqs assert num_decode_tokens + num_prefill_tokens == num_tokens @@ -639,6 +675,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): reqs_start = num_decodes # prefill_start context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs] + # Note(hc): The context lengths in the perspective of dcp rank0. + cp_context_lens_cpu = torch.ceil(context_lens_cpu.float() / + self.dcp_world_size).int() + origin_context_lens = context_lens_cpu.tolist() max_context_len_cpu = context_lens_cpu.max().item() num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item() prefill_query_start_loc = query_start_loc[ @@ -691,20 +731,66 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): out=cu_seq_lens_cpu[:, 1:], dtype=torch.int32) + if self.dcp_world_size > 1: + # Note(hc): The above max_context_chunk already enforces + # block_size alignment, DCP just need the block_size can + # be divisible by dcp_world_size, because DCP use + # cp_gather_cache which not require `cp_chunk_starts` + # aligned to page_size. 
+ assert max_context_chunk % self.dcp_world_size == 0 + cp_max_context_chunk = max_context_chunk // \ + self.dcp_world_size + cp_chunk_starts = \ + torch.arange(num_chunks, dtype=torch.int32) \ + .unsqueeze(1).expand(-1, num_prefills) \ + * cp_max_context_chunk + cp_chunk_ends = torch.min( + cp_context_lens_cpu.unsqueeze(0), + cp_chunk_starts + cp_max_context_chunk) + cp_chunk_seq_lens = (cp_chunk_ends - + cp_chunk_starts).clamp(min=0) + + cp_cu_seq_lens_cpu = torch.zeros(num_chunks, + num_prefills + 1, + dtype=torch.int32, + pin_memory=True) + torch.cumsum(cp_chunk_seq_lens, + dim=1, + out=cp_cu_seq_lens_cpu[:, 1:], + dtype=torch.int32) + chunked_context_metadata_cls = \ CudnnPrefillMetadata.ChunkedContextMetadata \ if self._use_cudnn_prefill else \ MLACommonPrefillMetadata.ChunkedContextMetadata - - chunked_context_metadata = \ - chunked_context_metadata_cls( - cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True), - starts=chunk_starts.to(device, non_blocking=True), - seq_tot=chunk_seq_lens.sum(dim=1).tolist(), - max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(), - seq_lens=chunk_seq_lens, - workspace=self.chunked_prefill_workspace, - ) + if self.dcp_world_size > 1: + chunked_context_metadata = \ + chunked_context_metadata_cls( + cu_seq_lens=cu_seq_lens_cpu \ + .to(device, non_blocking=True), + starts=cp_chunk_starts.to(device, non_blocking=True), + seq_tot=cp_chunk_seq_lens.sum(dim=1).tolist(), + max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(), + seq_lens=chunk_seq_lens, + workspace=self.chunked_prefill_workspace, + cp_chunk_seq_lens=cp_chunk_seq_lens.tolist(), + origin_context_lens=origin_context_lens, + cp_cu_seq_lens=cp_cu_seq_lens_cpu \ + .to(device, non_blocking=True), + chunk_size=max_context_chunk, + cu_seq_lens_lst=cu_seq_lens_cpu.tolist(), + ) + else: + chunked_context_metadata = \ + chunked_context_metadata_cls( + cu_seq_lens=cu_seq_lens_cpu \ + .to(device, non_blocking=True), + starts=chunk_starts.to(device, non_blocking=True), + 
seq_tot=chunk_seq_lens.sum(dim=1).tolist(), + max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(), + seq_lens=chunk_seq_lens, + workspace=self.chunked_prefill_workspace, + ) if self._use_cudnn_prefill: chunked_context_metadata.seq_lens = chunk_seq_lens @@ -757,6 +843,71 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): return attn_metadata +def reorg_kvcache( + allgatered_kv_c_normed: torch.Tensor, + allgatered_k_pe: torch.Tensor, + cp_chunk_seq_lens_lst: list[int], + origin_context_lens: list[int], + cp_world_size: int, + sum_seq_len: int, + max_seq_len: int, + chunk_size: int, + chunk_idx: int, + toks: int, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + reorg kvcache after cp local gather to tp layout for attn kernel. + + Args: + cp_chunk_seq_lens_lst: chunk context lengths under CP. + origin_context_lens: origin full context lengths under CP. + cp_world_size: CP size. + sum_seq_len: the sum of cp_chunk_seq_lens_lst. + max_seq_len: the max value of cp_chunk_seq_lens_lst. + chunk_size: equals to max_context_chunk from + chunked_context_metadata building. + chunk_idx: chunk idx of chunked_prefill. + toks: the number of tokens for local gather cache. 
+ """ + kv_c_segments = [] + k_pe_segments = [] + src_token_idx = 0 + max_seq_len_check = 0 + for cp_chunk_seq_len, origin_context_len in zip(cp_chunk_seq_lens_lst, + origin_context_lens): + chunk_context_len = chunk_size + if cp_chunk_seq_len != 0: + chunk_context_len = min( + chunk_context_len, origin_context_len - chunk_size * chunk_idx) + cp_target_rank = (chunk_context_len - 1) % cp_world_size + cur_seq_len = 0 + for rank in range(cp_world_size): + if rank > cp_target_rank and cp_chunk_seq_len: + real_cp_chunk_seq_len = cp_chunk_seq_len - 1 + else: + real_cp_chunk_seq_len = cp_chunk_seq_len + if real_cp_chunk_seq_len: + kv_c_segment = allgatered_kv_c_normed[rank * toks + + src_token_idx:rank * + toks + src_token_idx + + real_cp_chunk_seq_len] + k_pe_segment = allgatered_k_pe[rank * toks + + src_token_idx:rank * toks + + src_token_idx + + real_cp_chunk_seq_len] + kv_c_segments.append(kv_c_segment) + k_pe_segments.append(k_pe_segment) + cur_seq_len += real_cp_chunk_seq_len + max_seq_len_check = max(max_seq_len_check, cur_seq_len) + src_token_idx += cp_chunk_seq_len + reorganized_kv_c_normed = torch.cat(kv_c_segments, dim=0) + reorganized_k_pe = torch.cat(k_pe_segments, dim=0) + assert reorganized_kv_c_normed.shape[0] == sum_seq_len + assert reorganized_k_pe.shape[0] == sum_seq_len + assert max_seq_len_check == max_seq_len + return reorganized_kv_c_normed, reorganized_k_pe + + class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): """ NOTE: Please read the comment at the top of the file before trying to @@ -836,6 +987,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): self.vllm_flash_attn_version == 3 and current_platform.get_device_capability()[0] == 9) + self.dcp_world_size: Optional[int] = None + def _flash_attn_varlen_diff_headdims(self, q, k, @@ -1152,6 +1305,108 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): return output, output_lse + def _context_parallel_compute_prefill_context( + self, + q: torch.Tensor, + kv_c_and_k_pe_cache: 
torch.Tensor, + attn_metadata: MLACommonMetadata, + k_scale: torch.Tensor, + dcp_world_size: int, + ): + assert k_scale is None, "DCP not support sacled kvcache now." + assert attn_metadata.prefill is not None + prefill_metadata = attn_metadata.prefill + assert prefill_metadata.chunked_context is not None + assert prefill_metadata.chunked_context.cp_chunk_seq_lens is not None + assert prefill_metadata.chunked_context.origin_context_lens is not None + assert prefill_metadata.chunked_context.cp_cu_seq_lens is not None + assert prefill_metadata.chunked_context.chunk_size is not None + assert prefill_metadata.chunked_context.cu_seq_lens_lst is not None + + output = None + iters = len(prefill_metadata.chunked_context.seq_tot) + workspace = prefill_metadata.chunked_context.workspace + + for i in range(iters): + toks = prefill_metadata.chunked_context.seq_tot[i] + ops.cp_gather_cache( + src_cache=kv_c_and_k_pe_cache, + dst=workspace, + block_table=prefill_metadata.block_table, + cu_seq_lens=prefill_metadata.chunked_context.cp_cu_seq_lens[i], + batch_size=attn_metadata.num_prefills, + seq_starts=prefill_metadata.chunked_context.starts[i], + ) + # workspace + # |------- N tokens --------|--------- N*dcp_size tokens ----------| + # |<- use for loca_gather ->|<--------- use for allgather -------->| + allgather_offset = workspace.shape[0] // (dcp_world_size + 1) + assert allgather_offset * (dcp_world_size + + 1) == workspace.shape[0] + assert toks <= allgather_offset + local_gathered_kvcache = workspace[:toks] + cur_allgather_workspace = workspace[ + allgather_offset:allgather_offset * (1 + dcp_world_size)] + assert toks * dcp_world_size <= cur_allgather_workspace.shape[0] + cur_allgather_kvcache = cur_allgather_workspace[:toks * + dcp_world_size] + cur_allgather_kvcache.copy_(get_dcp_group().all_gather( + local_gathered_kvcache, dim=0)) + assert cur_allgather_kvcache.shape[ + -1] == self.kv_lora_rank + self.qk_rope_head_dim + allgatered_kv_c_normed, allgatered_k_pe = \ + 
cur_allgather_kvcache.unsqueeze( + 1).split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + + kv_c_normed, k_pe = reorg_kvcache( + allgatered_kv_c_normed, + allgatered_k_pe, + cp_chunk_seq_lens_lst=prefill_metadata.chunked_context. + cp_chunk_seq_lens[i], + origin_context_lens=prefill_metadata.chunked_context. + origin_context_lens, + cp_world_size=dcp_world_size, + sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i] + [-1], + max_seq_len=prefill_metadata.chunked_context.max_seq_lens[i], + chunk_size=prefill_metadata.chunked_context.chunk_size, + chunk_idx=i, + toks=toks) + + kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \ + -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + k_nope, v = kv_nope\ + .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), + dim=-1) + + attn_output, attn_softmax_lse = self._run_prefill_context_chunk( + prefill=prefill_metadata, + chunk_idx=i, + q=q, + k=k, + v=v, + ) + + if output is None: + output = attn_output + output_lse = attn_softmax_lse + else: + output_tmp = torch.empty_like(output) + output_lse_tmp = torch.empty_like(output_lse) + merge_attn_states( + output=output_tmp, + output_lse=output_lse_tmp, + prefix_output=output, + prefix_lse=output_lse, + suffix_output=attn_output, + suffix_lse=attn_softmax_lse, + ) + output = output_tmp + output_lse = output_lse_tmp + + return output, output_lse + def _forward_prefill( self, q: torch.Tensor, @@ -1162,6 +1417,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): k_scale: torch.Tensor, ) -> torch.Tensor: assert attn_metadata.prefill is not None + assert self.dcp_world_size is not None has_context = attn_metadata.prefill.chunked_context is not None kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\ @@ -1181,8 +1437,15 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): if has_context: suffix_output, suffix_lse = output - context_output, context_lse = self._compute_prefill_context( \ 
- q, kv_c_and_k_pe_cache, attn_metadata, k_scale) + if self.dcp_world_size > 1: + context_output, context_lse = \ + self._context_parallel_compute_prefill_context( + q, kv_c_and_k_pe_cache, attn_metadata, + k_scale=None, dcp_world_size=self.dcp_world_size) + else: + context_output, context_lse = \ + self._compute_prefill_context( + q, kv_c_and_k_pe_cache, attn_metadata, k_scale) output = torch.empty_like(suffix_output) merge_attn_states( @@ -1202,12 +1465,11 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): @abstractmethod def _forward_decode( self, - ql_nope: torch.Tensor, - q_pe: torch.Tensor, + q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: M, layer: AttentionLayer, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: raise NotImplementedError def forward( @@ -1235,6 +1497,9 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): # same expert outputs. return output.fill_(0) + if self.dcp_world_size is None: + self.dcp_world_size = get_dcp_group().world_size + fp8_attention = self.kv_cache_dtype.startswith("fp8") num_actual_toks = attn_metadata.num_actual_tokens @@ -1313,7 +1578,26 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): layer._q_scale) decode_q_pe = decode_q_pe.reshape(q_pe_shape) - output[:num_decode_tokens] = self._forward_decode( - decode_ql_nope, decode_q_pe, kv_cache, attn_metadata, layer) + decode_q = (decode_ql_nope, decode_q_pe) + if self.dcp_world_size > 1: + assert not fp8_attention, "DCP not support fp8 kvcache now." + # concatenate decode_ql_nope and decode_q_pe -> (B, N, L + P) + decode_q = torch.cat(decode_q, dim=-1) + # decode_q do allgather in head dim. + decode_q = get_dcp_group().all_gather(decode_q, dim=1) + # call decode attn + attn_out, lse = self._forward_decode(decode_q, kv_cache, + attn_metadata, layer) + + # recorect dcp attn_out with lse. 
+ if self.dcp_world_size > 1: + assert lse is not None, ( + "For a mla backend want to enable" + "DCP, it is mandatory that the corresponding decode attn" + "kernel return the softmax lse.") + attn_out = cp_lse_ag_out_rs(attn_out, lse, get_dcp_group()) + + # v_up projection + output[:num_decode_tokens] = self._v_up_proj(attn_out) return output_padded diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 705307d4dea3d..95dce8d8e2eef 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -232,7 +232,7 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): self._workspace.get_buf(), self.scale, self._num_kv_splits) - return self._v_up_proj(o) + return o # TODO: Currently we leave it here only for backup in case something is # wrong with the new SM100 CUTLASS MLA kernel @@ -265,21 +265,25 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): attn_metadata.decode.seq_lens, attn_metadata.decode.block_table, self.scale) - return self._v_up_proj(o) + return o def _forward_decode( self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, + q: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: MLACommonMetadata, layer: AttentionLayer, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if type(q) is tuple: + q_nope, q_pe = q + else: + q_nope, q_pe = torch.split( + q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) if self._use_old_cutlass_mla: # TODO: Remove the old cutlass MLA kernel after more extensive # testing return self._old_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache, - attn_metadata) + attn_metadata), None return self._sm100_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache, - attn_metadata) + attn_metadata), None diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 0e08307ddf841..e2a63c2f577e0 100644 --- 
a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import ClassVar, Optional, Union import torch @@ -154,15 +154,20 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]): def _forward_decode( self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, + q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: FlashAttnMLAMetadata, layer: AttentionLayer, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: assert kv_c_and_k_pe_cache.numel() > 0 assert attn_metadata.decode is not None + if type(q) is tuple: + q_nope, q_pe = q + else: + q_nope, q_pe = torch.split( + q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + if self.kv_cache_dtype.startswith("fp8"): raise NotImplementedError( "FP8 FlashAttention MLA not yet supported") diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index df617ab7a8ea7..11c91b8a0650e 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import ClassVar, Optional, Union import torch @@ -169,20 +169,20 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): def _forward_decode( self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, + q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: FlashMLAMetadata, layer: AttentionLayer, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: assert kv_c_and_k_pe_cache.numel() > 0 assert attn_metadata.decode is not None - q = torch.cat([q_nope, q_pe], dim=-1)\ - 
.unsqueeze(1) # Add seqlen dim of 1 (decode) + if type(q) is tuple: + q = torch.cat(q, dim=-1) - o, _ = flash_mla_with_kvcache( - q=q, + assert isinstance(q, torch.Tensor) + o, lse = flash_mla_with_kvcache( + q=q.unsqueeze(1), # Add seqlen dim of 1 (decode) k_cache=kv_c_and_k_pe_cache.unsqueeze(-2), # Add head dim of 1 block_table=attn_metadata.decode.block_table, cache_seqlens=attn_metadata.decode.seq_lens, @@ -196,4 +196,4 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): descale_k=layer._k_scale.reshape(1), ) - return self._v_up_proj(o) + return o, lse diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 42670093daa9b..fc6b1998e8eb0 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import ClassVar, Optional, Union import torch @@ -220,18 +220,19 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): def _forward_decode( self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, + q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: AiterMLAMetadata, layer: AttentionLayer, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: assert kv_c_and_k_pe_cache.numel() > 0 assert attn_metadata.decode is not None - B = q_nope.shape[0] + if type(q) is tuple: + q = torch.cat(q, dim=-1) - q = torch.cat([q_nope, q_pe], dim=-1) + assert isinstance(q, torch.Tensor) + B = q.shape[0] o = torch.zeros(B, self.num_heads, self.kv_lora_rank, @@ -249,4 +250,4 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): attn_metadata.decode.paged_kv_indices, attn_metadata.decode.paged_kv_last_page_len) - return self._v_up_proj(o) + return o, None diff --git a/vllm/v1/attention/backends/mla/triton_mla.py 
b/vllm/v1/attention/backends/mla/triton_mla.py index f2974ed668d99..d692b00d78b46 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional +from typing import Optional, Union import torch @@ -123,21 +123,22 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): def _forward_decode( self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, + q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: MLACommonMetadata, layer: AttentionLayer, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: assert kv_c_and_k_pe_cache.numel() > 0 assert attn_metadata.decode is not None if self.kv_cache_dtype.startswith("fp8"): raise NotImplementedError("FP8 Triton MLA not yet supported") - B = q_nope.shape[0] + if type(q) is tuple: + q = torch.cat(q, dim=-1) - q = torch.cat([q_nope, q_pe], dim=-1) + assert isinstance(q, torch.Tensor) + B = q.shape[0] o = torch.zeros(B, self.num_heads, self.kv_lora_rank, @@ -171,4 +172,4 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): attn_metadata.decode.seq_lens, attn_logits, num_kv_splits, self.scale, PAGE_SIZE) - return self._v_up_proj(o) + return o, None diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index 9421341f990c8..86771060c4099 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -24,6 +24,7 @@ class KVCacheCoordinator(ABC): use_eagle: bool, enable_caching: bool, enable_kv_cache_events: bool, + dcp_world_size: int, ): self.kv_cache_config = kv_cache_config self.max_model_len = max_model_len @@ -39,6 +40,7 @@ class KVCacheCoordinator(ABC): kv_cache_spec=kv_cache_group.kv_cache_spec, block_pool=self.block_pool, kv_cache_group_id=i, + dcp_world_size=dcp_world_size, ) for i, 
kv_cache_group in enumerate( self.kv_cache_config.kv_cache_groups)) @@ -197,9 +199,14 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator): """ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int, - use_eagle: bool, enable_kv_cache_events: bool): - super().__init__(kv_cache_config, max_model_len, use_eagle, False, - enable_kv_cache_events) + use_eagle: bool, enable_kv_cache_events: bool, + dcp_world_size: int): + super().__init__(kv_cache_config, + max_model_len, + use_eagle, + False, + enable_kv_cache_events, + dcp_world_size=dcp_world_size) self.num_single_type_manager = len(self.single_type_managers) def get_num_common_prefix_blocks(self, request_id: str, @@ -225,12 +232,19 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool, enable_caching: bool, - enable_kv_cache_events: bool): - super().__init__(kv_cache_config, max_model_len, use_eagle, - enable_caching, enable_kv_cache_events) + enable_kv_cache_events: bool, dcp_world_size: int): + super().__init__(kv_cache_config, + max_model_len, + use_eagle, + enable_caching, + enable_kv_cache_events, + dcp_world_size=dcp_world_size) self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[ 0].kv_cache_spec self.block_size = self.kv_cache_spec.block_size + self.dcp_world_size = dcp_world_size + if dcp_world_size > 1: + self.block_size *= dcp_world_size assert len(self.kv_cache_config.kv_cache_groups) == 1, ( "UnitaryKVCacheCoordinator assumes only one kv cache group") @@ -246,6 +260,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): block_pool=self.block_pool, kv_cache_spec=self.kv_cache_spec, use_eagle=self.use_eagle, + dcp_world_size=self.dcp_world_size, ) return hit_blocks, len(hit_blocks[0]) * self.block_size @@ -261,9 +276,14 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool, enable_caching: bool, - 
enable_kv_cache_events: bool): - super().__init__(kv_cache_config, max_model_len, use_eagle, - enable_caching, enable_kv_cache_events) + enable_kv_cache_events: bool, dcp_world_size: int): + super().__init__(kv_cache_config, + max_model_len, + use_eagle, + enable_caching, + enable_kv_cache_events, + dcp_world_size=dcp_world_size) + assert dcp_world_size == 1, "DCP not support hybrid attn now." self.verify_and_split_kv_cache_groups() def verify_and_split_kv_cache_groups(self) -> None: @@ -394,17 +414,27 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): return hit_blocks, hit_length -def get_kv_cache_coordinator( - kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool, - enable_caching: bool, - enable_kv_cache_events: bool) -> KVCacheCoordinator: +def get_kv_cache_coordinator(kv_cache_config: KVCacheConfig, + max_model_len: int, use_eagle: bool, + enable_caching: bool, + enable_kv_cache_events: bool, + dcp_world_size: int) -> KVCacheCoordinator: if not enable_caching: - return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len, + return KVCacheCoordinatorNoPrefixCache(kv_cache_config, + max_model_len, use_eagle, - enable_kv_cache_events) + enable_kv_cache_events, + dcp_world_size=dcp_world_size) if len(kv_cache_config.kv_cache_groups) == 1: - return UnitaryKVCacheCoordinator(kv_cache_config, max_model_len, - use_eagle, enable_caching, - enable_kv_cache_events) - return HybridKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle, - enable_caching, enable_kv_cache_events) + return UnitaryKVCacheCoordinator(kv_cache_config, + max_model_len, + use_eagle, + enable_caching, + enable_kv_cache_events, + dcp_world_size=dcp_world_size) + return HybridKVCacheCoordinator(kv_cache_config, + max_model_len, + use_eagle, + enable_caching, + enable_kv_cache_events, + dcp_world_size=dcp_world_size) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 87a11fe58a048..3a0fbb5e5c41e 100644 --- 
a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -91,6 +91,7 @@ class KVCacheManager: use_eagle: bool = False, log_stats: bool = False, enable_kv_cache_events: bool = False, + dcp_world_size: int = 1, ) -> None: self.max_model_len = max_model_len @@ -109,12 +110,20 @@ class KVCacheManager: self.block_size = kv_cache_config.kv_cache_groups[ 0].kv_cache_spec.block_size + if dcp_world_size > 1: + assert len(kv_cache_config.kv_cache_groups) == 1 + # Note(hc): need revisit. When both DCP and any future + # PCP are enabled, the block_size may need to be scaled + # by a factor of dcp_size × pcp_size? + self.block_size *= dcp_world_size + self.coordinator = get_kv_cache_coordinator( kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, use_eagle=self.use_eagle, enable_caching=self.enable_caching, enable_kv_cache_events=enable_kv_cache_events, + dcp_world_size=dcp_world_size, ) self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups) self.block_pool = self.coordinator.block_pool diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 248ad9cda7c28..aff1183e499a4 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -846,6 +846,12 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, ) num_tokens = num_blocks * vllm_config.cache_config.block_size + if vllm_config.parallel_config.decode_context_parallel_size > 1: + num_tokens *= vllm_config.parallel_config.decode_context_parallel_size + logger.info( + "Multiplying the GPU KV cache size by the dcp_world_size %d.", + vllm_config.parallel_config.decode_context_parallel_size) + num_tokens_str = f"{num_tokens:,}" logger.info("GPU KV cache size: %s tokens", num_tokens_str) max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 8322fa7335b69..31f7e9c70f8b3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ 
b/vllm/v1/core/sched/scheduler.py @@ -100,6 +100,15 @@ class Scheduler(SchedulerInterface): self.block_size = self.cache_config.block_size + self.dcp_world_size = \ + vllm_config.parallel_config.decode_context_parallel_size + # Note(hc): The scheduler’s block_size must be multiplied + # by dcp_world_size, since block hashes are computed on the + # original full token sequence at a granularity of + # original_block_size × dcp_world_size. + if self.dcp_world_size > 1: + self.block_size *= self.dcp_world_size + # req_id -> Request self.requests: dict[str, Request] = {} # Scheduling policy @@ -161,6 +170,7 @@ class Scheduler(SchedulerInterface): use_eagle=self.use_eagle, log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, + dcp_world_size=self.dcp_world_size, ) self.use_pp = self.parallel_config.pipeline_parallel_size > 1 diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index f6affb3dab66f..8159349e46758 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -25,6 +25,7 @@ class SingleTypeKVCacheManager(ABC): kv_cache_spec: KVCacheSpec, block_pool: BlockPool, kv_cache_group_id: int, + dcp_world_size: int = 1, ) -> None: """ Initializes the SingleTypeKVCacheManager. @@ -33,8 +34,10 @@ class SingleTypeKVCacheManager(ABC): block_pool: The block pool. kv_cache_group_id: The id of the kv cache group of this manager. 
""" - self.block_size = kv_cache_spec.block_size + self.dcp_world_size = dcp_world_size + if self.dcp_world_size > 1: + self.block_size *= dcp_world_size self.kv_cache_spec = kv_cache_spec self.block_pool = block_pool @@ -196,6 +199,7 @@ class SingleTypeKVCacheManager(ABC): block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + dcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: """ Get the longest cache hit prefix of the blocks that is not longer than @@ -253,6 +257,7 @@ class FullAttentionManager(SingleTypeKVCacheManager): block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + dcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance( kv_cache_spec, (FullAttentionSpec, ChunkedLocalAttentionSpec) @@ -260,7 +265,10 @@ class FullAttentionManager(SingleTypeKVCacheManager): "and chunked local attention groups" computed_blocks: tuple[list[KVCacheBlock], ...] = tuple( [] for _ in range(len(kv_cache_group_ids))) - max_num_blocks = max_length // kv_cache_spec.block_size + block_size = kv_cache_spec.block_size + if dcp_world_size > 1: + block_size *= dcp_world_size + max_num_blocks = max_length // block_size for block_hash in itertools.islice(block_hashes, max_num_blocks): # block_hashes is a chain of block hashes. If a block hash is not # in the cached_block_hash_to_id, the following block hashes are @@ -310,9 +318,11 @@ class SlidingWindowManager(SingleTypeKVCacheManager): block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + dcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, SlidingWindowSpec), ( "SlidingWindowManager can only be used for sliding window groups") + assert dcp_world_size == 1, "DCP not support sliding window attn now." # The number of contiguous blocks needed for prefix cache hit. 
# -1 since the input token itself is also included in the window @@ -408,6 +418,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager): block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + dcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: """ For chunked local attention, we need to find the longest cache hit @@ -445,6 +456,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager): "chunked local attention groups") assert use_eagle is False, ("Hybrid KV cache is not supported for " + "eagle + chunked local attention.") + assert dcp_world_size == 1, "DCP not support chunked local attn now." max_num_blocks = max_length // kv_cache_spec.block_size if max_length > 0: local_attention_start_idx = (max_length // @@ -525,10 +537,12 @@ class MambaManager(SingleTypeKVCacheManager): block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + dcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance( kv_cache_spec, MambaSpec), ("MambaManager can only be used for mamba groups") + assert dcp_world_size == 1, "DCP not support mamba now." # Prefix caching is not supported for mamba now. Always return empty # list. computed_blocks: tuple[list[KVCacheBlock], ...] 
= tuple( @@ -583,6 +597,7 @@ class CrossAttentionManager(SingleTypeKVCacheManager): block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + dcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, CrossAttentionSpec), ( "CrossAttentionManager can only be used for cross-attention groups" diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index a3e4d393e4d20..6467fcfe40aef 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -86,6 +86,12 @@ class FullAttentionSpec(AttentionSpec): def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len + dcp_world_size = \ + vllm_config.parallel_config.decode_context_parallel_size + # Note(hc): each dcp rank only need save + # (max_model_len//dcp_world_size) tokens locally. + if dcp_world_size > 1: + max_model_len = cdiv(max_model_len, dcp_world_size) return cdiv(max_model_len, self.block_size) * self.page_size_bytes @classmethod @@ -162,6 +168,8 @@ class SlidingWindowSpec(AttentionSpec): assert not self.use_mla, "MLA is not supported for sliding window" def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: + assert vllm_config.parallel_config.decode_context_parallel_size == 1, \ + "DCP not support sliding window." 
max_model_len = vllm_config.model_config.max_model_len max_num_batched_tokens = ( vllm_config.scheduler_config.max_num_batched_tokens) diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 6ab5ce2748a4a..c5902595a496b 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -4,6 +4,7 @@ import numpy as np import torch +from vllm.distributed import get_dcp_group from vllm.logger import init_logger from vllm.utils import cdiv @@ -50,6 +51,13 @@ class BlockTable: self.slot_mapping = torch.zeros(self.max_num_batched_tokens, dtype=torch.int64, device=self.device) + try: + self.dcp_world_size = get_dcp_group().world_size + self.dcp_rank = get_dcp_group().rank_in_group + except AssertionError: + # DCP might not be initialized in testing + self.dcp_world_size = 1 + self.dcp_rank = 0 def append_row( self, @@ -89,13 +97,36 @@ class BlockTable: # NOTE(woosuk): We can't simply use `token_indices // block_size` # here because M (max_model_len) is not necessarily divisible by # block_size. - block_table_indices = (req_indices * self.max_num_blocks_per_req + - positions // self.block_size) - block_numbers = self.block_table_np.ravel()[block_table_indices] - block_offsets = positions % self.block_size - np.add(block_numbers * self.block_size, - block_offsets, - out=self.slot_mapping_np[:req_indices.shape[0]]) + if self.dcp_world_size > 1: + # Note(hc): The DCP implement store kvcache with a interleave + # style, the kvcache for the token whose token_idx is i is + # always stored on the GPU whose dcp_rank equals i % cp_world_size: + + # Use a "virtual block" which equals to world_size * block_size + # for block_table_indices calculation. 
+ virtual_block_size = self.block_size * self.dcp_world_size + block_table_indices = (req_indices * self.max_num_blocks_per_req + + positions // virtual_block_size) + block_numbers = self.block_table_np.ravel()[block_table_indices] + # Use virtual_block_size for mask calculation, which marks local + # tokens. + virtual_block_offsets = positions % virtual_block_size + mask = virtual_block_offsets % self.dcp_world_size == self.dcp_rank + # Calcuate local block_offsets + block_offsets = virtual_block_offsets // self.dcp_world_size + # Calcuate slot_mapping + slot_mapping = block_numbers * self.block_size + block_offsets + # Write final slots, use -1 for not-local + self.slot_mapping_np[:req_indices.shape[0]] = np.where( + mask, slot_mapping, -1) + else: + block_table_indices = (req_indices * self.max_num_blocks_per_req + + positions // self.block_size) + block_numbers = self.block_table_np.ravel()[block_table_indices] + block_offsets = positions % self.block_size + np.add(block_numbers * self.block_size, + block_offsets, + out=self.slot_mapping_np[:req_indices.shape[0]]) def commit_block_table(self, num_reqs: int) -> None: self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs], @@ -128,9 +159,19 @@ class MultiGroupBlockTable: def __init__(self, max_num_reqs: int, max_model_len: int, max_num_batched_tokens: int, pin_memory: bool, device: torch.device, block_sizes: list[int]) -> None: + # Note(hc): each dcp rank only store + # (max_model_len//dcp_world_size) tokens in kvcache, + # so the block_size which used for calc max_num_blocks_per_req + # must be multiplied by dcp_world_size. 
+ try: + dcp_world_size = get_dcp_group().world_size + except AssertionError: + # DCP might not be initialized in testing + dcp_world_size = 1 + self.block_tables = [ - BlockTable(block_size, max_num_reqs, cdiv(max_model_len, - block_size), + BlockTable(block_size, max_num_reqs, + cdiv(max_model_len, block_size * dcp_world_size), max_num_batched_tokens, pin_memory, device) for block_size in block_sizes ] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5bee2dff98329..ba909f5e81b4f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -56,6 +56,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, LazyLoader, cdiv, check_use_alibi, get_dtype_size, is_pin_memory_available, round_up, supports_dynamo) +from vllm.v1.attention.backends.mla.flashmla import FlashMLABackend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, create_fast_prefill_custom_backend, @@ -187,6 +188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model_config.is_multimodal_raw_input_only_model) self.max_model_len = model_config.max_model_len + self.dcp_world_size = self.parallel_config.decode_context_parallel_size self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_reqs = scheduler_config.max_num_seqs @@ -428,6 +430,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return if self.reorder_batch_threshold is not None: + if self.dcp_world_size > 1: + assert self.reorder_batch_threshold == 1, \ + "DCP not support reorder_batch_threshold > 1 now." 
reorder_batch_to_split_decodes_and_prefills( self.input_batch, scheduler_output, @@ -3305,6 +3310,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): get_kv_transfer_group().set_host_xfer_buffer_ops( copy_kv_blocks) + if self.dcp_world_size > 1: + assert self.attn_groups[0][0].backend is FlashMLABackend, ( + "DCP only support flashmla now." + "For a mla backend want to enable DCP, it is mandatory that the" + "corresponding decode attn kernel return the softmax lse.") + def may_add_encoder_only_layers_to_kv_cache_config(self) -> None: """ Add encoder-only layers to the KV cache config. diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 99c805a3e9496..6a3bc5d46df27 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -616,7 +616,9 @@ def init_worker_distributed_environment( init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank, backend) - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + ensure_model_parallel_initialized( + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size, + parallel_config.decode_context_parallel_size) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 08bb4e7c9e479..b4a67e2899d0d 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -539,8 +539,10 @@ def init_worker_distributed_environment( init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank, current_platform.dist_backend) - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + ensure_model_parallel_initialized( + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size, + parallel_config.decode_context_parallel_size) ensure_kv_transfer_initialized(vllm_config) From 
6432739ef1b35be382733c6c081dab76696b1f96 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 5 Sep 2025 22:30:22 -0700 Subject: [PATCH 12/23] [Bugfix] Catch and log invalid token ids in detokenizer (#24351) Signed-off-by: Nick Hill --- vllm/v1/engine/detokenizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 0ccbe65493499..38f435f5166e0 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -233,6 +233,11 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): def _protected_step(self, next_token_id: int) -> Optional[str]: try: token = self.stream.step(self.tokenizer, next_token_id) + except OverflowError: + # Handle rare observed overflow, still to be diagnosed. + # See https://github.com/vllm-project/vllm/issues/21951. + logger.exception("Encountered invalid token id: %d", next_token_id) + token = None except Exception as e: if not str(e).startswith(INVALID_PREFIX_ERR_MSG): raise e From 53b19ccdd5a04f442283a9b077b78da3b0df79d4 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 6 Sep 2025 13:53:58 +0800 Subject: [PATCH 13/23] [Core] Allow disabling TP sharding for parallel Linear layer (#23024) Signed-off-by: Isotr0py Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/linear.py | 175 +++++++----------- .../model_loader/bitsandbytes_loader.py | 23 ++- vllm/model_executor/models/deepseek_v2.py | 6 +- vllm/model_executor/models/glm4_1v.py | 128 +++++-------- vllm/model_executor/models/qwen2_5_vl.py | 58 +++--- vllm/model_executor/models/step3_vl.py | 71 +++---- vllm/model_executor/parameter.py | 22 +-- 7 files changed, 203 insertions(+), 280 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 1224b94d56e06..fa8a261db7d7d 100644 --- a/vllm/model_executor/layers/linear.py +++ 
b/vllm/model_executor/layers/linear.py @@ -223,6 +223,7 @@ class LinearBase(CustomOp): quant_config: Quantization configure. prefix: Prefix for parameter names. return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, tensor parallelism will be disabled for this layer. """ def __init__( @@ -235,6 +236,7 @@ class LinearBase(CustomOp): prefix: str = "", *, return_bias: bool = True, + disable_tp: bool = False, ): super().__init__() @@ -254,6 +256,17 @@ class LinearBase(CustomOp): self.quant_method = quant_config.get_quant_method(self, prefix=prefix) self.return_bias = return_bias + self.disable_tp = disable_tp + self.tp_rank = (get_tensor_model_parallel_rank() + if not disable_tp else 0) + self.tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) + + def __post_init__(self): + for param in self.parameters(): + if isinstance(param, BasevLLMParameter): + param.tp_rank = self.tp_rank + param.tp_size = self.tp_size @CustomOp.register("replicated_linear") @@ -270,6 +283,7 @@ class ReplicatedLinear(LinearBase): prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) return_bias: If true, return bias together with outputs in forward pass. + disable_tp: Take no effect for replicated linear layers. """ def __init__( @@ -283,26 +297,21 @@ class ReplicatedLinear(LinearBase): prefix: str = "", *, return_bias: bool = True, + disable_tp: bool = False, ): - # If MergedReplicatedLinear, use output size of each partition. - if hasattr(self, "output_sizes"): - self.output_partition_sizes = self.output_sizes - else: - self.output_partition_sizes = [output_size] - super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix=prefix, - return_bias=return_bias) + return_bias=return_bias, + disable_tp=disable_tp) # All the linear layer supports quant method. 
assert self.quant_method is not None self.quant_method.create_weights(self, - self.input_size, - self.output_partition_sizes, + self.input_size, [self.output_size], self.input_size, self.output_size, self.params_dtype, @@ -358,74 +367,6 @@ class ReplicatedLinear(LinearBase): return s -class MergedReplicatedLinear(ReplicatedLinear): - """Replicated linear layer. - - Args: - input_size: input dimension of the linear layer. - output_sizes: list of output dimensions of the linear layer. - bias: If true, add bias. - skip_bias_add: If true, skip adding bias but instead return it. - params_dtype: Data type for the parameters. - quant_config: Quantization configure. - prefix: The name of the layer in the state dict, including all parents - (e.g. model.layers.0.qkv_proj) - return_bias: If true, return bias together with outputs in forward pass. - """ - - def __init__( - self, - input_size: int, - output_sizes: list[int], - bias: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - *, - return_bias: bool = True, - ): - self.output_sizes = output_sizes - super().__init__(input_size, - sum(output_sizes), - bias, - skip_bias_add, - params_dtype, - quant_config, - prefix=prefix, - return_bias=return_bias) - - def weight_loader(self, - param: Union[Parameter, BasevLLMParameter], - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[int] = None): - assert loaded_shard_id is not None - assert loaded_shard_id < len(self.output_sizes) - - if isinstance(param, BlockQuantScaleParameter): - from vllm.model_executor.layers.quantization.fp8 import ( - Fp8LinearMethod, Fp8MoEMethod) - assert self.quant_method is not None - assert isinstance(self.quant_method, - (Fp8LinearMethod, Fp8MoEMethod)) - weight_block_size = self.quant_method.quant_config.weight_block_size - assert weight_block_size is not None - block_n, _ = weight_block_size[0], weight_block_size[1] - shard_offset = 
( - (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // - block_n) - shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) // - block_n) - elif isinstance(param, PerTensorScaleParameter): - shard_offset = loaded_shard_id - shard_size = 1 - else: - shard_offset = sum(self.output_sizes[:loaded_shard_id]) - shard_size = self.output_sizes[loaded_shard_id] - - param.data[shard_offset:shard_offset + shard_size] = loaded_weight - - @CustomOp.register("column_parallel_linear") class ColumnParallelLinear(LinearBase): """Linear layer with column parallelism. @@ -448,7 +389,9 @@ class ColumnParallelLinear(LinearBase): output_sizes: list of output sizes packed into one output, like for QKV the list would be size 3. prefix: The name of the layer in the state dict, including all parents - (e.g. model.layers.0.qkv_proj) + (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, weights matrix won't be sharded through tp rank. """ def __init__( @@ -464,9 +407,13 @@ class ColumnParallelLinear(LinearBase): prefix: str = "", *, return_bias: bool = True, + disable_tp: bool = False, ): # Divide the weight matrix along the last dimension. 
- self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = (get_tensor_model_parallel_rank() + if not disable_tp else 0) + self.tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) self.input_size_per_partition = input_size self.output_size_per_partition = divide(output_size, self.tp_size) self.output_partition_sizes = [self.output_size_per_partition] @@ -483,7 +430,8 @@ class ColumnParallelLinear(LinearBase): params_dtype, quant_config, prefix, - return_bias=return_bias) + return_bias=return_bias, + disable_tp=disable_tp) self.gather_output = gather_output @@ -512,8 +460,6 @@ class ColumnParallelLinear(LinearBase): else: self.register_parameter("bias", None) - self.tp_rank = get_tensor_model_parallel_rank() - def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): output_dim = getattr(param, "output_dim", None) @@ -554,7 +500,8 @@ class ColumnParallelLinear(LinearBase): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): + def weight_loader_v2(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): # Special case for loading scales off disk, which often do not # have a shape (such as in the case of AutoFP8). if len(loaded_weight.shape) == 0: @@ -570,7 +517,7 @@ class ColumnParallelLinear(LinearBase): # Matrix multiply. assert self.quant_method is not None output_parallel = self.quant_method.apply(self, input_, bias) - if self.gather_output: + if self.gather_output and self.tp_size > 1: # All-gather across the partitions. 
output = tensor_model_parallel_all_gather(output_parallel) else: @@ -584,7 +531,7 @@ class ColumnParallelLinear(LinearBase): s = f"in_features={self.input_size}" s += f", output_features={self.output_size_per_partition}" s += f", bias={self.bias is not None}" - s += f", tp_size={get_tensor_model_parallel_world_size()}" + s += f", tp_size={self.tp_size}" s += f", gather_output={self.gather_output}" return s @@ -611,6 +558,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear): prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, all weights matrix won't be sharded, this layer + will be treated as a "Replicated" MergedLinear. """ def __init__( @@ -625,10 +574,13 @@ class MergedColumnParallelLinear(ColumnParallelLinear): prefix: str = "", *, return_bias: bool = True, + disable_tp: bool = False, ): self.output_sizes = output_sizes - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) + self.tp_rank = (get_tensor_model_parallel_rank() + if not disable_tp else 0) assert all(output_size % self.tp_size == 0 for output_size in output_sizes) @@ -640,7 +592,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear): params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, - return_bias=return_bias) + return_bias=return_bias, + disable_tp=disable_tp) def weight_loader(self, param: Parameter, @@ -832,8 +785,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear): assert loaded_shard_id < len(self.output_sizes) - tp_size = get_tensor_model_parallel_world_size() - if isinstance(param, BlockQuantScaleParameter): from vllm.model_executor.layers.quantization.fp8 import ( Fp8LinearMethod, Fp8MoEMethod) @@ -845,17 +796,19 @@ class MergedColumnParallelLinear(ColumnParallelLinear): 
block_n, _ = weight_block_size[0], weight_block_size[1] shard_offset = ( (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // - block_n) // tp_size + block_n) // self.tp_size shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) // - block_n // tp_size) + block_n // self.tp_size) else: - shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size - shard_size = self.output_sizes[loaded_shard_id] // tp_size + shard_offset = sum( + self.output_sizes[:loaded_shard_id]) // self.tp_size + shard_size = self.output_sizes[loaded_shard_id] // self.tp_size param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=loaded_shard_id, shard_offset=shard_offset, - shard_size=shard_size) + shard_size=shard_size, + tp_rank=self.tp_rank) class QKVParallelLinear(ColumnParallelLinear): @@ -883,6 +836,7 @@ class QKVParallelLinear(ColumnParallelLinear): prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, weights matrix won't be sharded through tp rank. """ def __init__( @@ -898,6 +852,7 @@ class QKVParallelLinear(ColumnParallelLinear): prefix: str = "", *, return_bias: bool = True, + disable_tp: bool = False, ): self.hidden_size = hidden_size self.head_size = head_size @@ -906,7 +861,8 @@ class QKVParallelLinear(ColumnParallelLinear): total_num_kv_heads = total_num_heads self.total_num_kv_heads = total_num_kv_heads # Divide the weight matrix along the last dimension. 
- tp_size = get_tensor_model_parallel_world_size() + tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) self.num_heads = divide(self.total_num_heads, tp_size) if tp_size >= self.total_num_kv_heads: self.num_kv_heads = 1 @@ -932,7 +888,8 @@ class QKVParallelLinear(ColumnParallelLinear): params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, - return_bias=return_bias) + return_bias=return_bias, + disable_tp=disable_tp) def _get_shard_offset_mapping(self, loaded_shard_id: str): shard_offset_mapping = { @@ -993,10 +950,13 @@ class QKVParallelLinear(ColumnParallelLinear): loaded_shard_id: Optional[str] = None): if loaded_shard_id is None: # special case for certain models if isinstance(param, PerTensorScaleParameter): - param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0) + param.load_qkv_weight(loaded_weight=loaded_weight, + shard_id=0, + tp_rank=self.tp_rank) return elif type(param) in (RowvLLMParameter, BasevLLMParameter): - param.load_qkv_weight(loaded_weight=loaded_weight) + param.load_qkv_weight(loaded_weight=loaded_weight, + tp_rank=self.tp_rank) return # TODO: @dsikka - move to parameter.py self._load_fused_module_from_checkpoint(param, loaded_weight) @@ -1020,7 +980,8 @@ class QKVParallelLinear(ColumnParallelLinear): num_heads=self.num_kv_head_replicas, shard_id=loaded_shard_id, shard_offset=shard_offset, - shard_size=shard_size) + shard_size=shard_size, + tp_rank=self.tp_rank) def weight_loader(self, param: Parameter, @@ -1226,6 +1187,7 @@ class RowParallelLinear(LinearBase): prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.down_proj) return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, weights matrix won't be sharded through tp rank. 
""" def __init__( @@ -1241,10 +1203,13 @@ class RowParallelLinear(LinearBase): prefix: str = "", *, return_bias: bool = True, + disable_tp: bool = False, ): # Divide the weight matrix along the first dimension. - self.tp_rank = get_tensor_model_parallel_rank() - self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = (get_tensor_model_parallel_rank() + if not disable_tp else 0) + self.tp_size = (get_tensor_model_parallel_world_size() + if not disable_tp else 1) self.input_size_per_partition = divide(input_size, self.tp_size) self.output_size_per_partition = output_size self.output_partition_sizes = [output_size] @@ -1255,7 +1220,8 @@ class RowParallelLinear(LinearBase): params_dtype, quant_config, prefix, - return_bias=return_bias) + return_bias=return_bias, + disable_tp=disable_tp) self.input_is_parallel = input_is_parallel self.reduce_results = reduce_results @@ -1339,10 +1305,9 @@ class RowParallelLinear(LinearBase): if self.input_is_parallel: input_parallel = input_ else: - tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.tp_size) - input_parallel = splitted_input[tp_rank].contiguous() + input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. assert self.quant_method is not None diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index b8393956eed3f..c8dd1ec0ec3c6 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -69,6 +69,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Store all module names (from transformers) that support # BNB quantization. self.target_modules: list[str] = [] + self.tp_disabled_modules: list[str] = [] # Store the mapping of expert parameters for MoE models. self.expert_params_mapping: list[tuple[str, str, int, str]] = [] # mapping weight names from transformers to vllm. 
@@ -322,14 +323,24 @@ class BitsAndBytesModelLoader(BaseModelLoader): quant_state_dict) -> Generator: from bitsandbytes.functional import quantize_4bit - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() + global_tp_size = get_tensor_model_parallel_world_size() + global_tp_rank = get_tensor_model_parallel_rank() for ( org_weight_name, mapped_weight_name, weight_tensor, ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + + # override tp_size and tp_rank if the module has disabled TP + if any(tp_disabled_module in mapped_weight_name + for tp_disabled_module in self.tp_disabled_modules): + tp_size = 1 + tp_rank = 0 + else: + tp_size = global_tp_size + tp_rank = global_tp_rank + if any(target_module in mapped_weight_name for target_module in self.target_modules ) and mapped_weight_name.endswith(".weight"): @@ -418,12 +429,16 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Map vllm's names to transformers's names. rep_name, sub_modules = modules_info for sub_name in sub_modules: - self.target_modules.append( - name.replace(rep_name, sub_name)) + new_name = name.replace(rep_name, sub_name) + self.target_modules.append(new_name) + if module.disable_tp: + self.tp_disabled_modules.append(new_name) # Add original module name even if the module has stacked map, # in case model has a mixture of disk-merged and disk-split # weights with same last name. self.target_modules.append(name) + if module.disable_tp: + self.tp_disabled_modules.append(name) elif isinstance(module, FusedMoE) and hasattr( module.quant_method, "quant_config"): # TODO: support FusedMoE with prequant and 8bit. 
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index bb95a1dbf122e..d65dcfebaeff8 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -43,7 +43,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, - MergedReplicatedLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -435,12 +434,13 @@ class DeepseekV2MLAAttention(nn.Module): self.max_position_embeddings = max_position_embeddings if self.q_lora_rank is not None: - self.fused_qkv_a_proj = MergedReplicatedLinear( + self.fused_qkv_a_proj = MergedColumnParallelLinear( self.hidden_size, [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], bias=False, quant_config=quant_config, - prefix=f"{prefix}.fused_qkv_a_proj") + prefix=f"{prefix}.fused_qkv_a_proj", + disable_tp=True) else: self.kv_a_proj_with_mqa = ReplicatedLinear( self.hidden_size, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index f9fd5163d66b4..fd5fecac67d67 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -51,14 +51,10 @@ from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.layernorm import RMSNorm -# yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, - MergedReplicatedLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) -# yapf: enable from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import 
MultiModelKeys @@ -174,20 +170,22 @@ class Glm4vVisionMLP(nn.Module): use_data_parallel: bool = False, ): super().__init__() - cls_gate_up = (MergedReplicatedLinear - if use_data_parallel else MergedColumnParallelLinear) - self.gate_up_proj = cls_gate_up(input_size=in_features, - output_sizes=[hidden_features] * 2, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - cls_down = (ReplicatedLinear - if use_data_parallel else RowParallelLinear) - self.down_proj = cls_down(hidden_features, - in_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.down_proj") + self.gate_up_proj = MergedColumnParallelLinear( + input_size=in_features, + output_sizes=[hidden_features] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + disable_tp=use_data_parallel, + ) + self.down_proj = RowParallelLinear( + hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj", + disable_tp=use_data_parallel, + ) self.act_fn = SiluAndMul() def forward(self, x: torch.Tensor): @@ -234,48 +232,32 @@ class Glm4vVisionAttention(nn.Module): # Per attention head and per partition values. 
self.tp_size = (1 if use_data_parallel else get_tensor_model_parallel_world_size()) - self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.tp_rank = (0 if use_data_parallel else + parallel_state.get_tensor_model_parallel_rank()) self.hidden_size_per_attention_head = dist_utils.divide( projection_size, num_heads) self.num_attention_heads_per_partition = dist_utils.divide( num_heads, self.tp_size) - if use_data_parallel: - self.qkv = ReplicatedLinear( - input_size=embed_dim, - output_size=3 * projection_size, - bias=False, - quant_config=quant_config, - # Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg - prefix=f"{prefix}.qkv_proj" - if quant_config else f"{prefix}.qkv", - ) - self.proj = ReplicatedLinear( - input_size=projection_size, - output_size=embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - bias=False, - ) - else: - self.qkv = QKVParallelLinear( - hidden_size=embed_dim, - head_size=self.hidden_size_per_attention_head, - total_num_heads=num_heads, - total_num_kv_heads=num_heads, - bias=False, - quant_config=quant_config, - # Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg - prefix=f"{prefix}.qkv_proj" - if quant_config else f"{prefix}.qkv", - ) - self.proj = RowParallelLinear( - input_size=projection_size, - output_size=embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - bias=False, - ) + self.qkv = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.hidden_size_per_attention_head, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + bias=False, + quant_config=quant_config, + # Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg + prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv", + disable_tp=use_data_parallel, + ) + self.proj = RowParallelLinear( + input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + bias=False, + disable_tp=use_data_parallel, + ) # Detect attention 
implementation. self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) @@ -494,41 +476,31 @@ class Glm4vPatchMerger(nn.Module): ) -> None: super().__init__() self.hidden_size = d_model - if use_data_parallel: - self.proj = ReplicatedLinear( - input_size=self.hidden_size, - output_size=self.hidden_size, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) - else: - self.proj = ColumnParallelLinear( - self.hidden_size, - self.hidden_size, - bias=bias, - gather_output=True, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) + self.proj = ColumnParallelLinear( + self.hidden_size, + self.hidden_size, + bias=bias, + gather_output=True, + quant_config=quant_config, + prefix=f"{prefix}.proj", + disable_tp=use_data_parallel, + ) self.post_projection_norm = nn.LayerNorm(self.hidden_size) - cls_gate_up = (MergedReplicatedLinear - if use_data_parallel else MergedColumnParallelLinear) - self.gate_up_proj = cls_gate_up( + self.gate_up_proj = MergedColumnParallelLinear( input_size=self.hidden_size, output_sizes=[context_dim] * 2, bias=bias, quant_config=quant_config, prefix=f"{prefix}.gate_up_proj", + disable_tp=use_data_parallel, ) - cls_down = (ReplicatedLinear - if use_data_parallel else RowParallelLinear) - self.down_proj = cls_down( + self.down_proj = RowParallelLinear( context_dim, self.hidden_size, bias=bias, quant_config=quant_config, prefix=f"{prefix}.down_proj", + disable_tp=use_data_parallel, ) self.act_fn = SiluAndMul() self.extra_activation_func = nn.GELU() diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index c8f7fc16b4e83..0a89f86fc7389 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -48,7 +48,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm # yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, - MergedReplicatedLinear, QKVParallelLinear, 
ReplicatedLinear, RowParallelLinear) @@ -178,22 +177,20 @@ class Qwen2_5_VisionMLP(nn.Module): prefix: str = "", use_data_parallel: bool = False): super().__init__() - cls_gate_up_proj = (MergedReplicatedLinear if use_data_parallel else - MergedColumnParallelLinear) - self.gate_up_proj = cls_gate_up_proj( + self.gate_up_proj = MergedColumnParallelLinear( input_size=in_features, output_sizes=[hidden_features] * 2, # [gate_proj, up_proj] bias=bias, quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") + prefix=f"{prefix}.gate_up_proj", + disable_tp=use_data_parallel) - cls_down_proj = (ReplicatedLinear - if use_data_parallel else RowParallelLinear) - self.down_proj = cls_down_proj(hidden_features, - in_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.down_proj") + self.down_proj = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj", + disable_tp=use_data_parallel) self.act_fn = act_fn def forward(self, x: torch.Tensor): @@ -243,30 +240,21 @@ class Qwen2_5_VisionAttention(nn.Module): self.num_attention_heads_per_partition = dist_utils.divide( num_heads, self.tp_size) - if use_data_parallel: - self.qkv = ReplicatedLinear(embed_dim, - self.hidden_size_per_attention_head * - 3 * num_heads, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.qkv") + self.qkv = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.hidden_size_per_attention_head, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + disable_tp=use_data_parallel) - else: - self.qkv = QKVParallelLinear( - hidden_size=embed_dim, - head_size=self.hidden_size_per_attention_head, - total_num_heads=num_heads, - total_num_kv_heads=num_heads, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.qkv") - - cls_proj = (ReplicatedLinear - if use_data_parallel else RowParallelLinear) - self.proj = 
cls_proj(input_size=projection_size, - output_size=embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj") + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + disable_tp=use_data_parallel) # Detect attention implementation. self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index f379d2c15fb6c..17299b64978e3 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -21,7 +21,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -667,35 +666,21 @@ class Step3VisionAttention(nn.Module): self.q_size = self.num_heads * self.head_dim - if use_data_parallel: - self.qkv_proj = ReplicatedLinear( - self.embed_dim, - 3 * self.q_size, - bias=True, - quant_config=quant_config, - prefix=prefix, - ) - self.out_proj = ReplicatedLinear( - self.total_num_heads * self.head_dim, - self.embed_dim, - bias=True, - quant_config=quant_config, - prefix=prefix, - ) - else: - self.qkv_proj = QKVParallelLinear( - self.embed_dim, - self.head_dim, - self.total_num_heads, - bias=True, - quant_config=quant_config, - prefix=prefix, - ) - self.out_proj = RowParallelLinear(self.embed_dim, - self.embed_dim, - bias=True, - quant_config=quant_config, - prefix=prefix) + self.qkv_proj = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.total_num_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + disable_tp=use_data_parallel, + ) + self.out_proj = 
RowParallelLinear(self.embed_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + disable_tp=use_data_parallel) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, @@ -740,20 +725,18 @@ class Step3VisionMLP(nn.Module): super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) - cls_fc1 = (ReplicatedLinear - if use_data_parallel else ColumnParallelLinear) - self.fc1 = cls_fc1(config.hidden_size, - config.intermediate_size, - bias=True, - quant_config=quant_config, - prefix=prefix) - cls_fc2 = (ReplicatedLinear - if use_data_parallel else RowParallelLinear) - self.fc2 = cls_fc2(config.intermediate_size, - config.hidden_size, - bias=True, - quant_config=quant_config, - prefix=prefix) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc1", + disable_tp=use_data_parallel) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + disable_tp=use_data_parallel) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 9465308e94e65..221712ba9a338 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -57,6 +57,8 @@ class BasevLLMParameter(Parameter): weight_loader = _make_synced_weight_loader(weight_loader) self._weight_loader = weight_loader + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() @property def weight_loader(self): @@ -116,10 +118,10 @@ class _ColumnvLLMParameter(BasevLLMParameter): return self._output_dim def load_column_parallel_weight(self, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() 
shard_size = self.data.shape[self.output_dim] loaded_weight = loaded_weight.narrow(self.output_dim, - tp_rank * shard_size, shard_size) + self.tp_rank * shard_size, + shard_size) assert self.data.shape == loaded_weight.shape self.data.copy_(loaded_weight) @@ -127,6 +129,7 @@ class _ColumnvLLMParameter(BasevLLMParameter): shard_offset = kwargs.get("shard_offset") shard_size = kwargs.get("shard_size") + # TODO: move these to PackedColumnParameter and PackedvLLMParameter if isinstance( self, @@ -137,11 +140,11 @@ class _ColumnvLLMParameter(BasevLLMParameter): param_data = self.data - tp_rank = get_tensor_model_parallel_rank() param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) loaded_weight = loaded_weight.narrow(self.output_dim, - tp_rank * shard_size, shard_size) + self.tp_rank * shard_size, + shard_size) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -161,8 +164,8 @@ class _ColumnvLLMParameter(BasevLLMParameter): shard_offset=shard_offset, shard_size=shard_size) param_data = self.data - tp_rank = get_tensor_model_parallel_rank() - shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads + shard_id = (self.tp_rank if shard_id == "q" else self.tp_rank // + num_heads) param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) loaded_weight = loaded_weight.narrow(self.output_dim, @@ -189,10 +192,10 @@ class RowvLLMParameter(BasevLLMParameter): return self._input_dim def load_row_parallel_weight(self, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() shard_size = self.data.shape[self.input_dim] loaded_weight = loaded_weight.narrow(self.input_dim, - tp_rank * shard_size, shard_size) + self.tp_rank * shard_size, + shard_size) if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) @@ -414,9 +417,6 @@ class SharedWeightParameter(BasevLLMParameter): "weight_loader": self._fake_weight_loader } - self.tp_rank = get_tensor_model_parallel_rank() - 
self.tp_size = get_tensor_model_parallel_world_size() - if self.tp_size > 1: raise NotImplementedError(f"{self.__class__.__name__} does not " "currently support tensor parallelism") From 6d6c6b05d37f12abe18cffdd9ade9f1fab864749 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Sat, 6 Sep 2025 13:58:36 +0800 Subject: [PATCH 14/23] [New Model]: google/embeddinggemma-300m (#24318) Signed-off-by: wang.yuqi --- docs/models/supported_models.md | 1 + tests/models/language/pooling/mteb_utils.py | 18 +++++++++-- .../language/pooling/test_st_projector.py | 7 +++- tests/models/registry.py | 1 + vllm/config/__init__.py | 2 ++ vllm/model_executor/models/adapters.py | 32 ++++++++++--------- vllm/model_executor/models/config.py | 9 ++++++ vllm/model_executor/models/gemma3.py | 31 +++++++++++------- vllm/model_executor/models/registry.py | 1 + 9 files changed, 73 insertions(+), 29 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 9db6f8036a73b..bdb29aac333c1 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -440,6 +440,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ | | `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Gemma3TextModel`C | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | | `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ | | `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. 
| | | ✅︎ | diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 7be1bba2ff69f..68b1cc80303ad 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -10,7 +10,8 @@ import numpy as np import pytest import requests -from tests.models.utils import EmbedModelInfo, RerankModelInfo +from tests.models.utils import (EmbedModelInfo, RerankModelInfo, + check_embeddings_close) # Most embedding models on the STS12 task (See #17175): # - Model implementation and minor changes in tensor dtype @@ -163,12 +164,14 @@ def mteb_test_embed_models(hf_runner, model_info: EmbedModelInfo, vllm_extra_kwargs=None, hf_model_callback=None, - atol=MTEB_RERANK_TOL): + atol=MTEB_EMBED_TOL): if not model_info.enable_test: # A model family has many models with the same architecture, # and we don't need to test each one. pytest.skip("Skipping test.") + example_prompts = ["The chef prepared a delicious meal."] + vllm_extra_kwargs = vllm_extra_kwargs or {} vllm_extra_kwargs["dtype"] = model_info.dtype @@ -191,6 +194,7 @@ def mteb_test_embed_models(hf_runner, vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype + vllm_outputs = vllm_model.embed(example_prompts) if model_info.mteb_score is None: with hf_runner(model_info.name, @@ -202,6 +206,16 @@ def mteb_test_embed_models(hf_runner, st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) st_dtype = next(hf_model.model.parameters()).dtype + + # Test embed_dims and whether to use normalize + hf_outputs = hf_model.encode(example_prompts) + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) else: st_main_score = model_info.mteb_score st_dtype = "Constant" diff --git a/tests/models/language/pooling/test_st_projector.py 
b/tests/models/language/pooling/test_st_projector.py index bafeb4060d80a..9301e705c4335 100644 --- a/tests/models/language/pooling/test_st_projector.py +++ b/tests/models/language/pooling/test_st_projector.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo) from .mteb_utils import mteb_test_embed_models # ST models with projector (Dense) layers @@ -13,6 +14,10 @@ ST_PROJECTOR_MODELS = [ mteb_score=0.688611955, enable_test=True, ), + LASTPoolingEmbedModelInfo("google/embeddinggemma-300m", + architecture="Gemma3TextModel", + mteb_score=0.7473819294684156, + enable_test=True) ] diff --git a/tests/models/registry.py b/tests/models/registry.py index 38efb01341ebe..c6ff50b5426e1 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -352,6 +352,7 @@ _EMBEDDING_EXAMPLE_MODELS = { # [Text-only] "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), # noqa: E501 + "Gemma3TextModel": _HfExamplesInfo("google/embeddinggemma-300m"), "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"), "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0", trust_remote_code=True), diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 8bdc22acf380e..c4434c37f4c76 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2750,6 +2750,8 @@ _STR_DTYPE_TO_TORCH_DTYPE = { _FLOAT16_NOT_SUPPORTED_MODELS = { "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.", "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.", + "gemma3_text": + "Numerical instability. Please use bfloat16 or float32 instead.", "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.", "glm4": "Numerical instability. 
Please use bfloat16 or float32 instead.", } diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 50c2cd97f3d09..bb96bc559200c 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -49,26 +49,28 @@ def _load_st_projector(model_config: "ModelConfig") -> Optional[nn.Module]: if not dense_modules: return None - module = dense_modules[0] - folder = module.get("path", "") + layers = [] + for module in dense_modules: + folder = module.get("path", "") - config_path = f"{folder}/config.json" if folder else "config.json" - layer_config = get_hf_file_to_dict(config_path, model_config.model, - model_config.revision) - if not layer_config: - return None + config_path = f"{folder}/config.json" if folder else "config.json" + layer_config = get_hf_file_to_dict(config_path, model_config.model, + model_config.revision) + if not layer_config: + continue - linear = nn.Linear(layer_config.get("in_features", 768), - layer_config.get("out_features", 768), - bias=layer_config.get("bias", True), - dtype=torch.float32) + linear = nn.Linear(layer_config.get("in_features", 768), + layer_config.get("out_features", 768), + bias=layer_config.get("bias", True), + dtype=torch.float32) - if _load_dense_weights(linear, folder, model_config): - layers = [linear] + if not _load_dense_weights(linear, folder, model_config): + continue + + layers.append(linear) if act_name := layer_config.get("activation_function"): layers.append(get_act_fn(act_name)) - return nn.Sequential(*layers).to(dtype=torch.float32) - + return nn.Sequential(*layers).to(dtype=torch.float32) except Exception: logger.exception("ST projector loading failed") diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 8b76a54332f81..f38e7fc202209 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -24,6 +24,14 @@ class VerifyAndUpdateConfig: raise NotImplementedError 
+class Gemma3TextModelConfig: + + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + hf_config = vllm_config.model_config.hf_config + hf_config.is_causal = not hf_config.use_bidirectional_attention + + class GteNewModelConfig(VerifyAndUpdateConfig): @staticmethod @@ -409,6 +417,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "GteModel": SnowflakeGteNewModelConfig, "GteNewModel": GteNewModelConfig, "GteNewForSequenceClassification": GteNewModelConfig, + "Gemma3TextModel": Gemma3TextModelConfig, "NomicBertModel": NomicBertModelConfig, "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig, "Qwen2ForRewardModel": Qwen2ForRewardModelConfig, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 410c715d5241b..1263e3049a14a 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -24,7 +24,7 @@ import torch.nn.functional as F from torch import nn from transformers import Gemma3TextConfig -from vllm.attention import Attention +from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -44,6 +44,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from ...attention.layers.encoder_only_attention import EncoderOnlyAttention from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, @@ -169,16 +170,24 @@ class Gemma3Attention(nn.Module): rope_scaling=self.rope_scaling, ) - # Initialize the attention. 
- self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - logits_soft_cap=attn_logits_soft_cap, - per_layer_sliding_window=sliding_window, - prefix=f"{prefix}.attn") + if getattr(config, "is_causal", True): + attn_type = AttentionType.DECODER + else: + attn_type = AttentionType.ENCODER_ONLY + + attn_cls = (EncoderOnlyAttention + if attn_type == AttentionType.ENCODER_ONLY else Attention) + + self.attn = attn_cls(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + attn_type=attn_type, + logits_soft_cap=attn_logits_soft_cap, + per_layer_sliding_window=sliding_window, + prefix=f"{prefix}.attn") def forward( self, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 38d300b03d2c4..c522fcab7f333 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -155,6 +155,7 @@ _EMBEDDING_MODELS = { "BertModel": ("bert", "BertEmbeddingModel"), "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"), + "Gemma3TextModel": ("gemma3", "Gemma3Model"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), "GPT2ForSequenceClassification": ("gpt2", "GPT2ForSequenceClassification"), "GritLM": ("gritlm", "GritLM"), From 305a1cc0d27870568c9c0b2ee6f2124479f0cb3a Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Sat, 6 Sep 2025 01:01:23 -0500 Subject: [PATCH 15/23] refactor: Turn GPUModelRunner.inputs_embeds to a CpuGpuBuffer (#24345) Signed-off-by: Andrew Sansom --- vllm/v1/utils.py | 21 +++++++++++++++++---- vllm/v1/worker/gpu_model_runner.py | 30 ++++++++++++++++++++---------- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 8f9face6fbf2e..ab9bee3e4544d 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ 
-19,6 +19,8 @@ from vllm.utils import (get_open_port, get_open_zmq_ipc_path, get_tcp_uri, kill_process_tree) if TYPE_CHECKING: + import numpy as np + from vllm.v1.engine.coordinator import DPCoordinator from vllm.v1.engine.utils import (CoreEngineActorManager, CoreEngineProcManager) @@ -97,20 +99,31 @@ class ConstantList(Generic[T], Sequence): class CpuGpuBuffer: + """Buffer to easily copy tensors between CPU and GPU.""" def __init__( self, - *args, + *size: Union[int, torch.SymInt], dtype: torch.dtype, device: torch.device, pin_memory: bool, - ): - self.cpu = torch.zeros(*args, + with_numpy: bool = True, + ) -> None: + self.cpu = torch.zeros(*size, dtype=dtype, device="cpu", pin_memory=pin_memory) - self.np = self.cpu.numpy() self.gpu = self.cpu.to(device) + self.np: np.ndarray + # To keep type hints simple (avoiding generics and subclasses), we + # only conditionally create the numpy array attribute. This can cause + # AttributeError if `self.np` is accessed when `with_numpy=False`. + if with_numpy: + if dtype == torch.bfloat16: + raise ValueError( + "Bfloat16 torch tensors cannot be directly cast to a " + "numpy array, so call CpuGpuBuffer with with_numpy=False") + self.np = self.cpu.numpy() def copy_to_gpu(self, n: Optional[int] = None) -> torch.Tensor: if n is None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ba909f5e81b4f..76ed5c5a6051f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -303,10 +303,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.query_start_loc = self._make_buffer(self.max_num_reqs + 1, dtype=torch.int32) self.seq_lens = self._make_buffer(self.max_num_reqs, dtype=torch.int32) - self.inputs_embeds = torch.zeros( - (self.max_num_tokens, self.hidden_size), - dtype=self.dtype, - device=self.device) + # Because inputs_embeds may be bfloat16 and we don't need a numpy + # version of this tensor, avoid a RuntimeError by not 
creating a + # numpy buffer. + self.inputs_embeds = self._make_buffer(self.max_num_tokens, + self.hidden_size, + dtype=self.dtype, + numpy=False) # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: @@ -374,11 +377,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): device="cpu", pin_memory=self.pin_memory) - def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer: - return CpuGpuBuffer(*args, + def _make_buffer(self, + *size: Union[int, torch.SymInt], + dtype: torch.dtype, + numpy: bool = True) -> CpuGpuBuffer: + # Bfloat16 torch tensors cannot be directly cast to a numpy array, so + # if a bfloat16 buffer is needed without a corresponding numpy array, + # don't bother instantiating the numpy array. + return CpuGpuBuffer(*size, dtype=dtype, device=self.device, - pin_memory=self.pin_memory) + pin_memory=self.pin_memory, + with_numpy=numpy) def _init_model_kwargs(self, num_tokens: int): model_kwargs = dict[str, Any]() @@ -1645,11 +1655,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) # TODO(woosuk): Avoid the copy. Optimize. 
- self.inputs_embeds[:num_scheduled_tokens].copy_( + self.inputs_embeds.gpu[:num_scheduled_tokens].copy_( inputs_embeds_scheduled) input_ids = None - inputs_embeds = self.inputs_embeds[:num_input_tokens] + inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] model_kwargs = { **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), @@ -2484,7 +2494,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_scheduled_tokens, remove_lora): if self.supports_mm_inputs: input_ids = None - inputs_embeds = self.inputs_embeds[:num_tokens] + inputs_embeds = self.inputs_embeds.gpu[:num_tokens] model_kwargs = { **self._init_model_kwargs(num_tokens), **self._dummy_mm_kwargs(num_reqs), From eddaafc1c77b0690194cbd1b73747d572793838c Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 6 Sep 2025 02:33:19 -0700 Subject: [PATCH 16/23] [Multimodal] Improve max video embedding length estimation in V1 (#24312) Signed-off-by: Roger Wang Co-authored-by: Roger Wang --- vllm/model_executor/models/llava_onevision.py | 5 +---- vllm/model_executor/models/qwen2_vl.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index e4ac0cd919101..bc340a9e2d8f8 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -216,12 +216,9 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): seq_len: int, mm_counts: Mapping[str, int], ) -> int: - max_images = mm_counts.get("image", 0) max_videos = mm_counts.get("video", 0) - max_image_tokens = self.get_max_image_tokens() * max_images - max_total_frames = self._get_max_video_frames(seq_len - - max_image_tokens) + max_total_frames = self._get_max_video_frames(seq_len) max_frames_per_video = min(max_total_frames // max(max_videos, 1), _MAX_FRAMES_PER_VIDEO) diff --git a/vllm/model_executor/models/qwen2_vl.py 
b/vllm/model_executor/models/qwen2_vl.py index ae7a8d8d7a5b9..b708719e4f9b8 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -915,12 +915,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): seq_len: int, mm_counts: Mapping[str, int], ) -> int: - max_images = mm_counts.get("image", 0) max_videos = mm_counts.get("video", 0) - max_image_tokens = self.get_max_image_tokens() * max_images - max_total_frames = self._get_max_video_frames(seq_len - - max_image_tokens) + max_total_frames = self._get_max_video_frames(seq_len) max_frames_per_video = min(max_total_frames // max(max_videos, 1), _MAX_FRAMES_PER_VIDEO) From b121ca22ad9a648513af098052915dd0afb1dd47 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 6 Sep 2025 06:31:56 -0700 Subject: [PATCH 17/23] [CI] Disable flaky structured output test from CI (#24366) Signed-off-by: Roger Wang --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index cd82eb2ac4199..fb49db8f1611d 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -41,8 +41,9 @@ EAGLE_SPEC_CONFIG = { PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None), ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), - ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", - None), + #FIXME: This test is flaky on CI thus disabled + #("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", + # None), ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None), ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None), ("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None), From 0077c8634e122d7161c3a72fc15e7c1fb8bb230b Mon Sep 17 
00:00:00 2001 From: Benjamin Chislett Date: Sat, 6 Sep 2025 10:03:35 -0400 Subject: [PATCH 18/23] Add @benchislett to codeowner for spec decode and structured outputs (#24362) Signed-off-by: Benjamin Chislett --- .github/CODEOWNERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d2839deccbf2a..682b27ac8986e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -27,7 +27,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat -/vllm/v1/structured_output @mgoin @russellb @aarnphm +/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett +/vllm/v1/spec_decode @benchislett /vllm/v1/attention/backends/triton_attn.py @tdoublep # Test ownership From 0eadaeff7e51ada11771b31d2056bda05b012f5c Mon Sep 17 00:00:00 2001 From: mohankku Date: Sat, 6 Sep 2025 08:17:03 -0700 Subject: [PATCH 19/23] [Bugfix] Avoid uninitialized usage of azp_val when AZP is false. 
(#24335) Signed-off-by: Mohan Kumar Kumar Signed-off-by: mohankku --- csrc/cpu/dnnl_kernels.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp index 1aa99614926df..9a3af4ac9d8a6 100644 --- a/csrc/cpu/dnnl_kernels.cpp +++ b/csrc/cpu/dnnl_kernels.cpp @@ -145,7 +145,8 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, } } - float scale_val, azp_val; + float scale_val; + float azp_val = 0.0f; if constexpr (AZP) { float max_scalar = max_value.reduce_max(); float min_scalar = min_value.reduce_min(); From 00a4e56d8dd470615f0dde2e4c996ed5564da35f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 7 Sep 2025 00:23:12 +0800 Subject: [PATCH 20/23] [Bugfix] Fix broken deepseek fp8 TP weights loading (#24367) Signed-off-by: Isotr0py --- vllm/model_executor/layers/linear.py | 4 +++- vllm/model_executor/layers/quantization/fp8.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index fa8a261db7d7d..fd88eac55cb51 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -262,7 +262,7 @@ class LinearBase(CustomOp): self.tp_size = (get_tensor_model_parallel_world_size() if not disable_tp else 1) - def __post_init__(self): + def update_param_tp_status(self): for param in self.parameters(): if isinstance(param, BasevLLMParameter): param.tp_rank = self.tp_rank @@ -459,6 +459,7 @@ class ColumnParallelLinear(LinearBase): }) else: self.register_parameter("bias", None) + self.update_param_tp_status() def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): @@ -1250,6 +1251,7 @@ class RowParallelLinear(LinearBase): }) else: self.register_parameter("bias", None) + self.update_param_tp_status() def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): input_dim = getattr(param, "input_dim", None) diff --git 
a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index de22cceb45d1e..65e0b70621532 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -270,7 +270,8 @@ class Fp8LinearMethod(LinearMethodBase): layer.weight_block_size = None if self.block_quant: - tp_size = get_tensor_model_parallel_world_size() + tp_size = getattr(layer, "tp_size", + get_tensor_model_parallel_world_size()) assert self.quant_config.weight_block_size is not None layer.weight_block_size = self.quant_config.weight_block_size block_n, block_k = ( From 7555d6b34af1aad14786a9451ff69968316a3ab4 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 7 Sep 2025 00:32:03 +0800 Subject: [PATCH 21/23] [Bugfix] Fix test_mixtral_moe (#24371) --- tests/kernels/moe/test_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 0ea9667914fd5..850c486b95240 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -371,8 +371,8 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, @pytest.mark.parametrize( "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) @torch.inference_mode() -def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, - monkeypatch): +def test_mixtral_moe(dist_init, dtype: torch.dtype, padding: bool, + use_rocm_aiter: bool, monkeypatch): """Make sure our Mixtral MoE implementation agrees with the one from huggingface.""" From 6024d115cdd4d23b117d4ba9bd27d7e0311fed19 Mon Sep 17 00:00:00 2001 From: Ashwin Phadke Date: Sat, 6 Sep 2025 22:12:19 +0530 Subject: [PATCH 22/23] Lora bias(enable_lora_bias) deprecate warning (#24339) Signed-off-by: Jee Jee Li Co-authored-by: Jee Jee Li --- vllm/config/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/config/__init__.py 
b/vllm/config/__init__.py index c4434c37f4c76..41322f4f2a25d 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2458,7 +2458,6 @@ class LoRAConfig: LoRA adapter. Will be removed in v0.12.0.""" lora_vocab_padding_size: ClassVar[int] = current_platform\ .get_lora_vocab_padding_size() - default_mm_loras: Optional[dict[str, str]] = None """Dictionary mapping specific modalities to LoRA model paths; this field is only applicable to multimodal models and should be leveraged when a @@ -2470,7 +2469,8 @@ class LoRAConfig: will be automatically assigned to 1-n with the names of the modalities in alphabetic order.""" bias_enabled: bool = False - """Enable bias for LoRA adapters.""" + """[DEPRECATED] Enable bias for LoRA adapters. This option will be + removed in v0.12.0.""" def compute_hash(self) -> str: """ @@ -2503,6 +2503,11 @@ class LoRAConfig: "in v0.12.0. Additional vocabulary support for " "LoRA adapters is being phased out.") + # Deprecation warning for enable_lora_bias + if self.bias_enabled: + logger.warning("`enable_lora_bias` is deprecated " + "and will be removed in v0.12.0.") + # Setting the maximum rank to 512 should be able to satisfy the vast # majority of applications. 
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512) From fb691ee4e776a5fa6780e3752884dc5e0c5ccda1 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Sat, 6 Sep 2025 15:10:32 -0400 Subject: [PATCH 23/23] [Fix] [gpt-oss] fix non-tool calling path for chat completion (#24324) --- tests/entrypoints/openai/test_serving_chat.py | 70 ++++++++++++++----- vllm/entrypoints/openai/serving_chat.py | 51 ++++++++------ 2 files changed, 83 insertions(+), 38 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index c609cfb5c0678..04805dbca74fa 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -36,21 +36,41 @@ def monkeypatch_module(): mpatch.undo() +@pytest.fixture(scope="module", + params=[True, False], + ids=["with_tool_parser", "without_tool_parser"]) +def with_tool_parser(request) -> bool: + return request.param + + @pytest.fixture(scope="module") -def gptoss_server(monkeypatch_module: pytest.MonkeyPatch): - with monkeypatch_module.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1") - args = [ - "--enforce-eager", - "--max-model-len", - "8192", +def default_server_args(with_tool_parser: bool): + args = [ + # use half precision for speed and memory savings in CI environment + "--enforce-eager", + "--max-model-len", + "4096", + "--reasoning-parser", + "openai_gptoss", + "--gpu-memory-utilization", + "0.8", + ] + if with_tool_parser: + args.extend([ "--tool-call-parser", "openai", - "--reasoning-parser", - "openai_gptoss", "--enable-auto-tool-choice", - ] - with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, args) as remote_server: + ]) + return args + + +@pytest.fixture(scope="module") +def gptoss_server(monkeypatch_module: pytest.MonkeyPatch, + default_server_args: list[str]): + with monkeypatch_module.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1") + with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, + 
default_server_args) as remote_server: yield remote_server @@ -61,7 +81,8 @@ async def gptoss_client(gptoss_server): @pytest.mark.asyncio -async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI): +async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI, + with_tool_parser: bool): tools = [{ "type": "function", "function": { @@ -94,10 +115,14 @@ async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI): ] stream = await gptoss_client.chat.completions.create( - model=GPT_OSS_MODEL_NAME, messages=messages, tools=tools, stream=True) + model=GPT_OSS_MODEL_NAME, + messages=messages, + tools=tools if with_tool_parser else None, + stream=True) name = None args_buf = "" + content_buf = "" async for chunk in stream: delta = chunk.choices[0].delta if delta.tool_calls: @@ -106,13 +131,22 @@ async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI): name = tc.function.name if tc.function and tc.function.arguments: args_buf += tc.function.arguments - - assert name is not None - assert len(args_buf) > 0 + if getattr(delta, "content", None): + content_buf += delta.content + if with_tool_parser: + assert name is not None + assert len(args_buf) > 0 + else: + assert name is None + assert len(args_buf) == 0 + assert len(content_buf) > 0 @pytest.mark.asyncio -async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI): +async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI, + with_tool_parser: bool): + if not with_tool_parser: + pytest.skip("skip non-tool for multi-turn tests") tools = [{ "type": "function", "function": { @@ -175,7 +209,7 @@ async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI): ) second_msg = second.choices[0].message assert (second_msg.content is not None and len(second_msg.content) > 0) or \ - (second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0) # noqa: E501 + (second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0) MODEL_NAME = "openai-community/gpt2" diff --git 
a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 4cc22787a0208..5c7adc53f49b2 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -6,7 +6,7 @@ import json import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence -from typing import TYPE_CHECKING, Callable, Final, Optional, Union +from typing import Callable, Final, Optional, Union import jinja2 import partial_json_parser @@ -1174,6 +1174,7 @@ class OpenAIServingChat(OpenAIServing): for output in final_res.outputs: token_ids = output.token_ids out_logprobs = output.logprobs + tool_call_info = None if request.logprobs and request.top_logprobs is not None: assert out_logprobs is not None, "Did not output logprobs" @@ -1188,32 +1189,42 @@ class OpenAIServingChat(OpenAIServing): logprobs = None if self.use_harmony: - if TYPE_CHECKING: - assert self.tool_parser is not None - tool_parser = self.tool_parser(tokenizer) - # NOTE: We use token_ids for openai tool parser - tool_call_info = tool_parser.extract_tool_calls( - "", - request=request, - token_ids=token_ids, # type: ignore - ) - reasoning_content, content = None, tool_call_info.content - if request.include_reasoning: + if self.tool_parser is not None: + tool_parser = self.tool_parser(tokenizer) + # NOTE: We use token_ids for openai tool parser + tool_call_info = tool_parser.extract_tool_calls( + "", + request=request, + token_ids=token_ids, # type: ignore + ) + reasoning_content, content = None, tool_call_info.content + if request.include_reasoning: + reasoning_content, content, _ = parse_chat_output( + token_ids) + message = ChatMessage( + role=role, + reasoning_content=reasoning_content, + content=content, + tool_calls=tool_call_info.tool_calls, + ) + else: reasoning_content, content, _ = parse_chat_output( token_ids) - message = ChatMessage( - role=role, - reasoning_content=reasoning_content, - 
content=content, - tool_calls=tool_call_info.tool_calls, - ) + if not request.include_reasoning: + reasoning_content = None + message = ChatMessage( + role=role, + reasoning_content=reasoning_content, + content=content, + ) choice_data = ChatCompletionResponseChoice( index=output.index, message=message, logprobs=logprobs, - finish_reason="tool_calls" - if tool_call_info.tools_called else + finish_reason="tool_calls" if + (tool_call_info is not None + and tool_call_info.tools_called) else output.finish_reason if output.finish_reason else "stop", stop_reason=output.stop_reason, )