From 9f04a6cf577b69ee2365be986a38f74431dee6d0 Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Tue, 19 Aug 2025 12:17:31 -0700 Subject: [PATCH] Revert "Merge commit '6e8d8c4afbddf725b34ef938616701869f5b3462' into sage/dbo-full-cudagraphsh" This reverts commit 5215c80a4988e81d2f5971e02d50d3785cab5ae8, reversing changes made to dd2a94fd9d90d0c04772380c220b5ec81bd0b61e. --- .buildkite/nightly-benchmarks/README.md | 1 + .../tests/genai-perf-tests.json | 1 + .../tests/nightly-tests.json | 6 + .../tests/serving-tests-cpu-snc2.json | 6 + .../tests/serving-tests-cpu-snc3.json | 6 + .../tests/serving-tests-cpu.json | 5 + .../tests/serving-tests.json | 6 +- .buildkite/test-pipeline.yaml | 25 +- SECURITY.md | 40 +- benchmarks/kernels/benchmark_moe.py | 9 +- .../benchmark_reshape_and_cache_flash.py | 156 ------- .../benchmark_fp8_block_dense_gemm.py | 45 +- csrc/cache_kernels.cu | 92 +--- csrc/moe/topk_softmax_kernels.cu | 6 +- docker/Dockerfile | 57 +-- docs/features/multimodal_inputs.md | 44 -- docs/features/spec_decode.md | 4 - docs/features/structured_outputs.md | 2 +- .../installation/cpu/apple.inc.md | 12 +- .../installation/cpu/build.inc.md | 22 +- .../installation/cpu/s390x.inc.md | 45 +- docs/models/supported_models.md | 1 - examples/offline_inference/vision_language.py | 332 +++++++------- .../vision_language_multi_image.py | 215 ++++------ .../disagg_proxy_demo.py | 2 - .../disagg_proxy_p2p_nccl_xpyd.py | 2 - requirements/common.txt | 2 +- requirements/cuda.txt | 4 +- requirements/test.txt | 24 +- setup.py | 186 ++++---- tests/compile/test_fusion_all_reduce.py | 15 +- tests/config/test_mp_reducer.py | 1 + tests/engine/test_arg_utils.py | 7 +- .../kernels/moe/modular_kernel_tools/utils.py | 31 +- tests/kernels/moe/test_batched_deepgemm.py | 103 ----- .../kernels/moe/test_cutlass_grouped_gemm.py | 21 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 5 +- tests/kernels/moe/test_deepgemm.py | 20 +- tests/kernels/moe/utils.py | 4 +- tests/kernels/quant_utils.py | 19 + tests/kernels/quantization/test_block_fp8.py | 2 +- .../quantization/test_cutlass_scaled_mm.py | 5 + tests/lora/test_multi_loras_with_tp.py | 158 ------- tests/lora/test_qwen2vl.py | 6 + .../multimodal/generation/test_common.py | 27 +- .../generation/vlm_utils/model_utils.py | 12 - .../multimodal/processing/test_glm4_1v.py | 51 --- .../processing/test_transformers.py | 2 +- tests/models/registry.py | 4 +- tests/models/test_transformers.py | 28 +- tests/mq_llm_engine/test_load.py | 2 +- tests/multimodal/test_image.py | 115 +---- tests/multimodal/test_processing.py | 113 ++--- .../speculators/test_eagle3.py | 26 -- tests/tpu/lora/test_lora.py | 23 +- tests/v1/engine/test_async_llm.py | 4 +- tests/v1/engine/test_llm_engine.py | 26 -- tests/v1/test_async_llm_dp.py | 1 + vllm/attention/backends/flashinfer.py | 46 +- vllm/compilation/decorators.py | 11 +- vllm/compilation/fusion_attn.py | 3 - vllm/compilation/inductor_pass.py | 3 +- vllm/compilation/pass_manager.py | 2 +- vllm/config.py | 90 +--- .../device_communicators/ray_communicator.py | 257 ----------- vllm/engine/arg_utils.py | 74 +--- vllm/engine/async_llm_engine.py | 26 +- vllm/engine/multiprocessing/engine.py | 27 +- vllm/entrypoints/cli/serve.py | 25 +- vllm/entrypoints/openai/api_server.py | 29 +- vllm/entrypoints/openai/protocol.py | 9 - vllm/entrypoints/openai/run_batch.py | 6 +- vllm/envs.py | 30 +- vllm/executor/multiproc_worker_utils.py | 42 +- vllm/executor/ray_distributed_executor.py | 15 - vllm/inputs/registry.py | 17 +- vllm/lora/layers.py | 8 +- .../layers/fused_moe/deep_gemm_moe.py | 77 +--- .../fused_moe/deepep_ht_prepare_finalize.py | 13 +- .../layers/fused_moe/fused_moe.py | 4 +- .../compressed_tensors_moe.py | 53 +-- .../layers/quantization/gptq.py | 22 +- .../layers/quantization/modelopt.py | 150 +++++-- .../quantization/utils/flashinfer_fp4_moe.py | 154 ------- .../quantization/utils/nvfp4_moe_support.py | 59 --- .../model_loader/bitsandbytes_loader.py | 39 +- vllm/model_executor/models/aya_vision.py | 12 +- vllm/model_executor/models/deepseek_vl2.py | 36 +- vllm/model_executor/models/florence2.py | 6 + vllm/model_executor/models/fuyu.py | 4 +- vllm/model_executor/models/gemma3n.py | 16 +- vllm/model_executor/models/glm4_1v.py | 16 +- vllm/model_executor/models/h2ovl.py | 16 +- .../models/hyperclovax_vision.py | 20 +- vllm/model_executor/models/idefics3.py | 10 +- vllm/model_executor/models/internvl.py | 28 +- vllm/model_executor/models/keye.py | 84 +++- vllm/model_executor/models/llama_eagle3.py | 26 +- vllm/model_executor/models/llava.py | 46 +- vllm/model_executor/models/minicpmv.py | 6 +- vllm/model_executor/models/mllama4.py | 2 +- vllm/model_executor/models/nemotron_vl.py | 24 +- vllm/model_executor/models/nvlm_d.py | 16 +- vllm/model_executor/models/ovis.py | 8 +- vllm/model_executor/models/phi3v.py | 11 + vllm/model_executor/models/phi4_multimodal.py | 22 +- vllm/model_executor/models/phi4mm.py | 21 +- vllm/model_executor/models/qwen2.py | 21 +- .../models/qwen2_5_omni_thinker.py | 47 +- vllm/model_executor/models/qwen2_5_vl.py | 63 ++- vllm/model_executor/models/qwen2_audio.py | 18 +- vllm/model_executor/models/qwen2_vl.py | 82 +++- vllm/model_executor/models/qwen3.py | 7 - vllm/model_executor/models/registry.py | 3 +- vllm/model_executor/models/skyworkr1v.py | 86 +++- vllm/model_executor/models/smolvlm.py | 10 +- vllm/model_executor/models/tarsier.py | 12 +- vllm/model_executor/models/transformers.py | 17 +- vllm/model_executor/models/ultravox.py | 20 +- vllm/model_executor/models/whisper.py | 15 +- vllm/model_executor/utils.py | 20 +- vllm/multimodal/image.py | 37 +- vllm/transformers_utils/config.py | 32 +- vllm/transformers_utils/configs/__init__.py | 2 - .../configs/speculators/__init__.py | 2 - .../configs/speculators/algos.py | 32 -- .../configs/speculators/base.py | 91 ---- vllm/transformers_utils/processor.py | 92 ++-- vllm/utils/__init__.py | 107 ++--- vllm/utils/deep_gemm.py | 76 +--- vllm/utils/flashinfer.py | 81 +--- vllm/v1/attention/backends/flash_attn.py | 32 +- vllm/v1/attention/backends/flashinfer.py | 404 ++++-------------- vllm/v1/attention/backends/mla/common.py | 23 +- vllm/v1/attention/backends/mla/flashmla.py | 9 +- .../attention/backends/mla/rocm_aiter_mla.py | 4 +- vllm/v1/attention/backends/triton_attn.py | 6 +- vllm/v1/attention/backends/utils.py | 18 +- vllm/v1/core/kv_cache_utils.py | 6 +- vllm/v1/engine/async_llm.py | 34 +- vllm/v1/engine/coordinator.py | 122 ++---- vllm/v1/engine/core.py | 96 ++--- vllm/v1/engine/core_client.py | 46 +- vllm/v1/engine/processor.py | 9 +- vllm/v1/executor/multiproc_executor.py | 14 +- vllm/v1/metrics/stats.py | 4 - vllm/v1/utils.py | 1 - vllm/v1/worker/gpu_model_runner.py | 24 +- vllm/v1/worker/gpu_worker.py | 5 - 149 files changed, 1970 insertions(+), 3770 deletions(-) delete mode 100644 benchmarks/kernels/benchmark_reshape_and_cache_flash.py delete mode 100644 tests/kernels/moe/test_batched_deepgemm.py delete mode 100644 tests/lora/test_multi_loras_with_tp.py delete mode 100644 tests/models/multimodal/processing/test_glm4_1v.py delete mode 100644 tests/speculative_decoding/speculators/test_eagle3.py delete mode 100644 vllm/distributed/device_communicators/ray_communicator.py delete mode 100644 vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py delete mode 100644 vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py delete mode 100644 vllm/transformers_utils/configs/speculators/__init__.py delete mode 100644 vllm/transformers_utils/configs/speculators/algos.py delete mode 100644 vllm/transformers_utils/configs/speculators/base.py diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index 3721d3d1d6749..fcde284efea98 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -104,6 +104,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", + "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json index f26ae7634f3d9..edbe9f2df0ce0 100644 --- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -11,6 +11,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", + "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 41b4a4008801d..fda1a7a3ec53c 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -35,6 +35,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", + "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -89,6 +90,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", + "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -143,6 +145,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", + "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -194,6 +197,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", + "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -247,6 +251,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", + "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -300,6 +305,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", + "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json index dd0e24edff98d..a144b4420fbf1 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -17,6 +17,7 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -49,6 +50,7 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -81,6 +83,7 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -114,6 +117,7 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -149,6 +153,7 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -184,6 +189,7 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json index f1bda65a7590b..e6e69b63b74df 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -17,6 +17,7 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -49,6 +50,7 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -82,6 +84,7 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -115,6 +118,7 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -150,6 +154,7 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -186,6 +191,7 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index f150b9abeea45..ce1f924de387f 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -17,6 +17,7 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -49,6 +50,7 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -81,6 +83,7 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -114,6 +117,7 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -149,6 +153,7 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", + "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index a6d4141d5c2dc..13fd5aa8db97b 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -7,6 +7,7 @@ "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", + "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -25,6 +26,7 @@ "tensor_parallel_size": 4, "swap_space": 16, "disable_log_stats": "", + "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -43,6 +45,7 @@ "tensor_parallel_size": 2, "swap_space": 16, "disable_log_stats": "", + "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -57,7 +60,8 @@ "test_name": "serving_llama70B_tp4_sharegpt_specdecode", "qps_list": [2], "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "disable_log_requests": "", "tensor_parallel_size": 4, "swap_space": 16, "speculative_config": { diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index cc1223d4c4653..2f6cc45be77e6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -647,31 +647,13 @@ steps: - label: Blackwell Test working_dir: "/vllm-workspace/" gpu: b200 - # optional: true + optional: true source_file_dependencies: - - csrc/quantization/fp4/ - - csrc/attention/mla/ - - csrc/quantization/cutlass_w8a8/moe/ - - vllm/model_executor/layers/fused_moe/cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/fusion.py + - csrc/ + - vllm/ commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py - # Attention - # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py - - pytest -v -s tests/kernels/test_cutlass_mla_decode.py - # Quantization - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - # Fusion - - pytest -v -s tests/compile/test_fusion_all_reduce.py ##### 1 GPU test ##### ##### multi gpus test ##### @@ -822,7 +804,6 @@ steps: # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_multi_loras_with_tp.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/SECURITY.md b/SECURITY.md index 414669fb3712e..6053cfb41f35b 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,45 +1,13 @@ # Security Policy -## Reporting security issues +## Reporting a Vulnerability -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). +If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. -## Issue triage +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). -Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). - -## Threat model +--- Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. - -## Issue severity - -We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories: - -### CRITICAL Severity - -Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥ 9.0. - -### HIGH Severity - -Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9 - -### MODERATE Severity - -Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9 - -### LOW Severity - -Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0 - -## Prenotification policy - -For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues. - -* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release. - -* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis. - -* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included. diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 72250e2fb6d2b..c350aaf5d3ad2 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -22,13 +22,6 @@ from vllm.utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() -def ensure_divisibility(numerator, denominator): - """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, ( - "intermediate_size {} is not divisible by tp {}.".format(numerator, denominator) - ) - - class BenchmarkConfig(TypedDict): BLOCK_SIZE_M: int BLOCK_SIZE_N: int @@ -610,7 +603,7 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - ensure_divisibility(intermediate_size, args.tp_size) + hidden_size = config.hidden_size dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py deleted file mode 100644 index d4648c18f31d5..0000000000000 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ /dev/null @@ -1,156 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - -import random -import time - -import torch -from tabulate import tabulate - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.utils import ( - STR_DTYPE_TO_TORCH_DTYPE, - FlexibleArgumentParser, - create_kv_caches_with_random_flash, -) - -logger = init_logger(__name__) - - -@torch.inference_mode() -def run_benchmark( - num_tokens: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - kv_cache_dtype: str, - kv_cache_layout: str, - num_iters: int, - device: str = "cuda", -) -> float: - """Return latency (seconds) for given num_tokens.""" - - if kv_cache_dtype == "fp8" and head_size % 16: - raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") - - current_platform.seed_everything(42) - torch.set_default_device(device) - - # create random key / value tensors [T, H, D]. - key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) - value = torch.randn_like(key) - - # prepare the slot mapping. - # each token is assigned a unique slot in the KV-cache. - num_slots = block_size * num_blocks - if num_tokens > num_slots: - raise ValueError("num_tokens cannot exceed the total number of cache slots") - slot_mapping_lst = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) - - key_caches, value_caches = create_kv_caches_with_random_flash( - num_blocks, - block_size, - 1, # num_layers - num_heads, - head_size, - kv_cache_dtype, - dtype, - device=device, - cache_layout=kv_cache_layout, - ) - key_cache, value_cache = key_caches[0], value_caches[0] - - # compute per-kernel scaling factors for fp8 conversion (if used). - k_scale = (key.amax() / 64.0).to(torch.float32) - v_scale = (value.amax() / 64.0).to(torch.float32) - - def run_cuda_benchmark(n_iters: int) -> float: - nonlocal key, value, key_cache, value_cache, slot_mapping - torch.cuda.synchronize() - start = time.perf_counter() - for _ in range(n_iters): - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - slot_mapping, - kv_cache_dtype, - k_scale, - v_scale, - ) - torch.cuda.synchronize() - end = time.perf_counter() - return (end - start) / n_iters - - # warm-up - run_cuda_benchmark(3) - - lat = run_cuda_benchmark(num_iters) - - # free tensors to mitigate OOM when sweeping - del key, value, key_cache, value_cache, slot_mapping - torch.cuda.empty_cache() - - return lat - - -def main(args): - rows = [] - for layout in ["NHD", "HND"]: - for exp in range(1, 17): - n_tok = 2**exp - lat = run_benchmark( - num_tokens=n_tok, - num_heads=args.num_heads, - head_size=args.head_size, - block_size=args.block_size, - num_blocks=args.num_blocks, - dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], - kv_cache_dtype=args.kv_cache_dtype, - kv_cache_layout=layout, - num_iters=args.iters, - device="cuda", - ) - rows.append([n_tok, layout, f"{lat * 1e6:.3f}"]) - - print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"])) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - - parser.add_argument("--num-heads", type=int, default=128) - parser.add_argument( - "--head-size", - type=int, - choices=[64, 80, 96, 112, 120, 128, 192, 256], - default=128, - ) - parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) - parser.add_argument("--num-blocks", type=int, default=128 * 512) - - parser.add_argument( - "--dtype", - type=str, - choices=["half", "bfloat16", "float"], - default="bfloat16", - ) - - parser.add_argument( - "--kv-cache-dtype", - type=str, - choices=["auto", "fp8"], - default="auto", - ) - - parser.add_argument("--iters", type=int, default=100) - args = parser.parse_args() - - main(args) diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index b99c2099f2c38..43c54d56ca8c1 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -4,16 +4,49 @@ # ruff: noqa: E501 import time +# Import DeepGEMM functions +import deep_gemm import torch +from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor +# Import vLLM functions from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - get_col_major_tma_aligned_tensor, per_token_group_quant_fp8, w8a8_block_fp8_matmul, ) from vllm.triton_utils import triton -from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8 + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 +def per_token_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-token scaling.""" + assert x.dim() == 2 and x.size(1) % 128 == 0 + m, n = x.shape + x_view = x.view(m, -1, 128) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + return (x_view * (448.0 / x_amax.unsqueeze(2))).to( + torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 +def per_block_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-block scaling.""" + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + x_amax / 448.0).view(x_view.size(0), x_view.size(2)) def benchmark_shape(m: int, @@ -36,14 +69,14 @@ def benchmark_shape(m: int, # Pre-quantize B for all implementations # (weights can be pre-quantized offline) - B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) - B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) # Block size configuration block_size = [128, 128] # Pre-quantize A for all implementations - A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) + A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) @@ -52,7 +85,7 @@ def benchmark_shape(m: int, # === DeepGEMM Implementation === def deepgemm_gemm(): - fp8_gemm_nt((A_deepgemm, A_scale_deepgemm), + deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm) return C_deepgemm diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 131dcb15cd7e9..88559c8fe7183 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -5,7 +5,6 @@ #include "cuda_utils.h" #include "cuda_compat.h" #include "dispatch_utils.h" -#include "quantization/vectorization_utils.cuh" #ifdef USE_ROCM #include "quantization/fp8/amd/quant_utils.cuh" @@ -262,26 +261,14 @@ __global__ void reshape_and_cache_kernel( } } -// Used by vectorization_utils to copy/convert one element -template -struct CopyWithScaleOp { - float scale; - - __device__ __forceinline__ void operator()(OutT& dst, const InT src) const { - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - dst = static_cast(src); - } else { - dst = fp8::scaled_convert(src, scale); - } - } -}; - template __global__ void reshape_and_cache_flash_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // NHD or HND, shape see comments below - cache_t* __restrict__ value_cache, // same above + cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, + // head_size] + cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, + // head_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int64_t block_stride, const int64_t page_stride, const int64_t head_stride, const int64_t key_stride, @@ -295,58 +282,25 @@ __global__ void reshape_and_cache_flash_kernel( } const int64_t block_idx = slot_idx / block_size; const int64_t block_offset = slot_idx % block_size; - const int n_elems = num_heads * head_size; - - // pointers to the beginning of the source row for this token. - const scalar_t* __restrict__ key_src = key + token_idx * key_stride; - const scalar_t* __restrict__ value_src = value + token_idx * value_stride; - - // find the start position inside the kv-cache for this token. - cache_t* __restrict__ key_dst = - key_cache + block_idx * block_stride + block_offset * page_stride; - cache_t* __restrict__ value_dst = - value_cache + block_idx * block_stride + block_offset * page_stride; - - // this is true for the NHD layout where `head_stride == head_size` - const bool is_contiguous_heads = (head_stride == head_size); - - float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; - float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale; - constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4; - CopyWithScaleOp k_op{k_scale_val}; - CopyWithScaleOp v_op{v_scale_val}; - if (is_contiguous_heads) { - // NHD layout - // kv cache: [num_blocks, block_size, num_heads, head_size] - vectorize_with_alignment(key_src, key_dst, n_elems, threadIdx.x, - blockDim.x, k_op); - - vectorize_with_alignment(value_src, value_dst, n_elems, - threadIdx.x, blockDim.x, v_op); - - } else { - // HND layout: heads are strided, but each head_size segment is contiguous - // kv cache: [num_blocks, num_heads, block_size, head_size] - const int lane = threadIdx.x & 31; // 0..31 within warp - const int warp_id = threadIdx.x >> 5; // warp index within block - const int warps_per_block = blockDim.x >> 5; - - for (int head = warp_id; head < num_heads; head += warps_per_block) { - const scalar_t* __restrict__ k_src_h = key_src + head * head_size; - const scalar_t* __restrict__ v_src_h = value_src + head * head_size; - - cache_t* __restrict__ k_dst_h = - key_dst + static_cast(head) * head_stride; - cache_t* __restrict__ v_dst_h = - value_dst + static_cast(head) * head_stride; - - // within each head, let the 32 threads of the warp perform the vector - // copy - vectorize_with_alignment(k_src_h, k_dst_h, head_size, lane, 32, - k_op); - - vectorize_with_alignment(v_src_h, v_dst_h, head_size, lane, 32, - v_op); + const int n = num_heads * head_size; + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int64_t src_key_idx = token_idx * key_stride + i; + const int64_t src_value_idx = token_idx * value_stride + i; + const int head_idx = i / head_size; + const int head_offset = i % head_size; + const int64_t tgt_key_value_idx = block_idx * block_stride + + block_offset * page_stride + + head_idx * head_stride + head_offset; + scalar_t tgt_key = key[src_key_idx]; + scalar_t tgt_value = value[src_value_idx]; + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + key_cache[tgt_key_value_idx] = tgt_key; + value_cache[tgt_key_value_idx] = tgt_value; + } else { + key_cache[tgt_key_value_idx] = + fp8::scaled_convert(tgt_key, *k_scale); + value_cache[tgt_key_value_idx] = + fp8::scaled_convert(tgt_value, *v_scale); } } } diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 7a7865b901de1..0b505d2e04a21 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -24,12 +24,9 @@ #ifndef USE_ROCM #include #include - #include - using AddOp = cuda::std::plus; #else #include #include - using AddOp = cub::Sum; #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -65,6 +62,7 @@ __launch_bounds__(TPB) __global__ const int thread_row_offset = blockIdx.x * num_cols; + cub::Sum sum; float threadData(-FLT_MAX); // Don't touch finished rows. @@ -94,7 +92,7 @@ __launch_bounds__(TPB) __global__ threadData += exp((static_cast(input[idx]) - float_max)); } - const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp()); + const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum); if (threadIdx.x == 0) { diff --git a/docker/Dockerfile b/docker/Dockerfile index 0d6afca74e867..69aeee67a4300 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,3 +1,4 @@ + # The vLLM Dockerfile is used to construct vLLM image that can be directly used # to run the OpenAI compatible server. @@ -15,7 +16,6 @@ ARG PYTHON_VERSION=3.12 # Example: # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 -# TODO: Restore to base image after FlashInfer AOT wheel fixed ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 # By parameterizing the Deadsnakes repository URL, we allow third-party to use @@ -206,16 +206,7 @@ ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels -ARG VLLM_USE_PRECOMPILED -# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed -ENV VLLM_USE_PRECOMPILED="" -RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \ - export VLLM_USE_PRECOMPILED=1 && \ - echo "Using precompiled wheels"; \ - else \ - unset VLLM_USE_PRECOMPILED && \ - echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \ - fi +ARG VLLM_USE_PRECOMPILED="" # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -232,6 +223,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ @@ -245,9 +238,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ + export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \ + export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi +# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others +RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \ + echo "Cleaning up extra wheels in dist/..." && \ + # Identify the most recent manylinux1_x86_64 wheel + KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \ + if [ -n "$KEEP_WHEEL" ]; then \ + echo "Keeping wheel: $KEEP_WHEEL"; \ + find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \ + fi; \ + fi + # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py @@ -283,6 +289,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ #################### vLLM installation IMAGE #################### # image with vLLM installed +# TODO: Restore to base image after FlashInfer AOT wheel fixed FROM ${FINAL_BASE_IMAGE} AS vllm-base ARG CUDA_VERSION ARG PYTHON_VERSION @@ -363,6 +370,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi # Install vllm wheel first, so that torch etc will be installed. +# !bang RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ uv pip install --system dist/*.whl --verbose \ @@ -427,33 +435,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/build.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') -# Install DeepGEMM from source -ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git" -ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1" -RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' - . /etc/environment - CUDA_MAJOR="${CUDA_VERSION%%.*}" - CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}" - CUDA_MINOR="${CUDA_MINOR%%.*}" - if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then - git clone --recursive --shallow-submodules \ - ${DEEPGEMM_GIT_REPO} deepgemm - echo "🏗️ Building DeepGEMM" - pushd deepgemm - git checkout ${DEEPGEMM_GIT_REF} - # Build DeepGEMM - # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh) - rm -rf build dist - rm -rf *.egg-info - python3 setup.py bdist_wheel - uv pip install --system dist/*.whl - popd - rm -rf deepgemm - else - echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})" - fi -BASH - #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index cdd32924b5668..b8677f11a1d3c 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -172,36 +172,6 @@ Multi-image input can be extended to perform video captioning. We show this with print(generated_text) ``` -#### Custom RGBA Background Color - -When loading RGBA images (images with transparency), vLLM converts them to RGB format. By default, transparent pixels are replaced with white background. You can customize this background color using the `rgba_background_color` parameter in `media_io_kwargs`. - -??? code - - ```python - from vllm import LLM - - # Default white background (no configuration needed) - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - # Custom black background for dark theme - llm = LLM( - model="llava-hf/llava-1.5-7b-hf", - media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}} - ) - - # Custom brand color background (e.g., blue) - llm = LLM( - model="llava-hf/llava-1.5-7b-hf", - media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}} - ) - ``` - -!!! note - - The `rgba_background_color` accepts RGB values as a list `[R, G, B]` or tuple `(R, G, B)` where each value is 0-255 - - This setting only affects RGBA images with transparency; RGB images are unchanged - - If not specified, the default white background `(255, 255, 255)` is used for backward compatibility - ### Video Inputs You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary @@ -508,20 +478,6 @@ Full example: ``` -#### Custom RGBA Background Color - -To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`: - -```bash -# Example: Black background for dark theme -vllm serve llava-hf/llava-1.5-7b-hf \ - --media-io-kwargs '{"image": {"rgba_background_color": [0, 0, 0]}}' - -# Example: Custom gray background -vllm serve llava-hf/llava-1.5-7b-hf \ - --media-io-kwargs '{"image": {"rgba_background_color": [128, 128, 128]}}' -``` - ### Audio Inputs Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in). diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index 89d5b489e1888..be4b91feda7aa 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -15,10 +15,6 @@ Speculative decoding is a technique which improves inter-token latency in memory The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. -!!! warning - In vllm v0.10.0, speculative decoding with a draft model is not supported. - If you use the following code, you will get a `NotImplementedError`. - ??? code ```python diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 8a934d406f382..4f737afa80f55 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -103,7 +103,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", } ], - response_format={ + "response_format": { "type": "json_schema", "json_schema": { "name": "car-description", diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md index 2828173a76a9a..0816f38ac68a1 100644 --- a/docs/getting_started/installation/cpu/apple.inc.md +++ b/docs/getting_started/installation/cpu/apple.inc.md @@ -1,6 +1,6 @@ # --8<-- [start:installation] -vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS. +vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS. Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. @@ -23,20 +23,20 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] -After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from source. +After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -uv pip install -r requirements/cpu.txt -uv pip install -e . +pip install -r requirements/cpu.txt +pip install -e . ``` !!! note - On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device. + On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. !!! example "Troubleshooting" - If the build fails with errors like the following where standard C++ headers cannot be found, try to remove and reinstall your + If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your [Command Line Tools for Xcode](https://developer.apple.com/download/all/). ```text diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md index 57a09e674a821..fa777fe0c8a1a 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -1,4 +1,4 @@ -First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: +First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: ```bash sudo apt-get update -y @@ -6,34 +6,28 @@ sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certific sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` -Second, clone the vLLM project: +Second, clone vLLM project: ```bash git clone https://github.com/vllm-project/vllm.git vllm_source cd vllm_source ``` -Third, install required dependencies: +Third, install Python packages for vLLM CPU backend building: ```bash -uv pip install -r requirements/cpu-build.txt --torch-backend auto -uv pip install -r requirements/cpu.txt --torch-backend auto +pip install --upgrade pip +pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu +pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` -??? console "pip" - ```bash - pip install --upgrade pip - pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu - pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - ``` - -Finally, build and install vLLM: +Finally, build and install vLLM CPU backend: ```bash VLLM_TARGET_DEVICE=cpu python setup.py install ``` -If you want to develop vLLM, install it in editable mode instead. +If you want to develop vllm, install it in editable mode instead. ```bash VLLM_TARGET_DEVICE=cpu python setup.py develop diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md index c1917267ce91b..acfb3396896bf 100644 --- a/docs/getting_started/installation/cpu/s390x.inc.md +++ b/docs/getting_started/installation/cpu/s390x.inc.md @@ -1,6 +1,6 @@ # --8<-- [start:installation] -vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform. +vLLM has experimental support for s390x architecture on IBM Z platform. For now, users shall build from the vLLM source to natively run on IBM Z platform. Currently the CPU implementation for s390x architecture supports FP32 datatype only. @@ -40,32 +40,21 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \ . "$HOME/.cargo/env" ``` -Execute the following commands to build and install vLLM from source. +Execute the following commands to build and install vLLM from the source. !!! tip - Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM. + Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. ```bash sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds - uv pip install -v \ - --torch-backend auto \ + pip install -v \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ -r requirements-build.txt \ -r requirements-cpu.txt \ VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ - uv pip install dist/*.whl + pip install dist/*.whl ``` -??? console "pip" - ```bash - sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds - pip install -v \ - --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ - -r requirements-build.txt \ - -r requirements-cpu.txt \ - VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ - pip install dist/*.whl - ``` - # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] @@ -74,19 +63,19 @@ Execute the following commands to build and install vLLM from source. ```bash docker build -f docker/Dockerfile.s390x \ - --tag vllm-cpu-env . + --tag vllm-cpu-env . -# Launch OpenAI server +# Launching OpenAI server docker run --rm \ - --privileged true \ - --shm-size 4g \ - -p 8000:8000 \ - -e VLLM_CPU_KVCACHE_SPACE= \ - -e VLLM_CPU_OMP_THREADS_BIND= \ - vllm-cpu-env \ - --model meta-llama/Llama-3.2-1B-Instruct \ - --dtype float \ - other vLLM OpenAI server arguments + --privileged=true \ + --shm-size=4g \ + -p 8000:8000 \ + -e VLLM_CPU_KVCACHE_SPACE= \ + -e VLLM_CPU_OMP_THREADS_BIND= \ + vllm-cpu-env \ + --model=meta-llama/Llama-3.2-1B-Instruct \ + --dtype=float \ + other vLLM OpenAI server arguments ``` # --8<-- [end:build-image-from-source] diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 56c77a1e5f118..f5d9e3b22f2a6 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -713,7 +713,6 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | -| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ | ### Pooling Models diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index a75b8e2b047d8..6f23a29e72f71 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -423,6 +423,51 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ) +# SmolVLM2-2.2B-Instruct +def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + mm_processor_kwargs={ + "max_image_size": {"longest_edge": 384}, + }, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [ + (f"<|im_start|>User:{question}\nAssistant:") + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + +# omni-research/Tarsier-7b +def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "omni-research/Tarsier-7b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [(f"USER: \n{question} ASSISTANT:") for question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # Intern-S1 def run_interns1(questions: list[str], modality: str) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -496,6 +541,44 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ) +# Nemontron_VL +def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData: + model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + limit_mm_per_prompt={modality: 1}, + ) + + assert modality == "image" + placeholder = "" + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"{placeholder}\n{question}"}] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + # Stop tokens for InternVL + # models variants may have different stop tokens + # please refer to the model card for the correct "stop words": + # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py + stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # Keye-VL def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: model_name = "Kwai-Keye/Keye-VL-8B-Preview" @@ -551,41 +634,6 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: ) -def run_llama4(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=4, - tensor_parallel_size=8, - gpu_memory_utilization=0.4, - limit_mm_per_prompt={modality: 1}, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [ - [ - { - "role": "user", - "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], - } - ] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) - stop_token_ids = None - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - # LLaVA-1.5 def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -828,6 +876,41 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ) +def run_llama4(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=4, + tensor_parallel_size=8, + gpu_memory_utilization=0.4, + limit_mm_per_prompt={modality: 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [ + [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], + } + ] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + stop_token_ids = None + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # Molmo def run_molmo(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -853,44 +936,6 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: ) -# Nemontron_VL -def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData: - model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=8192, - limit_mm_per_prompt={modality: 1}, - ) - - assert modality == "image" - placeholder = "" - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - messages = [ - [{"role": "user", "content": f"{placeholder}\n{question}"}] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - # Stop tokens for InternVL - # models variants may have different stop tokens - # please refer to the model card for the correct "stop words": - # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py - stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - # NVLM-D def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1248,6 +1293,37 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: + model_name = "omni-research/Tarsier2-Recap-7b" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}, + limit_mm_per_prompt={modality: 1}, + ) + + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompts = [ + ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # SkyworkR1V def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1281,111 +1357,6 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: ) -# SmolVLM2-2.2B-Instruct -def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=2, - enforce_eager=True, - mm_processor_kwargs={ - "max_image_size": {"longest_edge": 384}, - }, - limit_mm_per_prompt={modality: 1}, - ) - prompts = [ - (f"<|im_start|>User:{question}\nAssistant:") - for question in questions - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - -# Step3 -def run_step3(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "stepfun-ai/step3-fp8" - - # NOTE: Below are verified configurations for step3-fp8 - # on 8xH100 GPUs. - engine_args = EngineArgs( - model=model_name, - max_num_batched_tokens=4096, - gpu_memory_utilization=0.85, - tensor_parallel_size=8, - limit_mm_per_prompt={modality: 1}, - reasoning_parser="step3", - ) - - prompts = [ - "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n " - f"{question} <|EOT|><|BOT|>assistant\n\n" - for question in questions - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - -# omni-research/Tarsier-7b -def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - model_name = "omni-research/Tarsier-7b" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=4096, - limit_mm_per_prompt={modality: 1}, - ) - prompts = [(f"USER: \n{question} ASSISTANT:") for question in questions] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - -def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: - model_name = "omni-research/Tarsier2-Recap-7b" - - engine_args = EngineArgs( - model=model_name, - max_model_len=4096, - hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}, - limit_mm_per_prompt={modality: 1}, - ) - - if modality == "image": - placeholder = "<|image_pad|>" - elif modality == "video": - placeholder = "<|video_pad|>" - - prompts = [ - ( - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n" - ) - for question in questions - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - model_example_map = { "aria": run_aria, "aya_vision": run_aya_vision, @@ -1402,9 +1373,9 @@ model_example_map = { "idefics3": run_idefics3, "interns1": run_interns1, "internvl_chat": run_internvl, + "nemotron_vl": run_nemotron_vl, "keye_vl": run_keye_vl, "kimi_vl": run_kimi_vl, - "llama4": run_llama4, "llava": run_llava, "llava-next": run_llava_next, "llava-next-video": run_llava_next_video, @@ -1414,8 +1385,8 @@ model_example_map = { "minicpmv": run_minicpmv, "mistral3": run_mistral3, "mllama": run_mllama, + "llama4": run_llama4, "molmo": run_molmo, - "nemotron_vl": run_nemotron_vl, "NVLM_D": run_nvlm_d, "ovis": run_ovis, "paligemma": run_paligemma, @@ -1430,7 +1401,6 @@ model_example_map = { "qwen2_5_omni": run_qwen2_5_omni, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, - "step3": run_step3, "tarsier": run_tarsier, "tarsier2": run_tarsier2, } diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 1ab405fa14f3a..dd50f3639709e 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -197,53 +197,6 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_hyperclovax_seed_vision( - question: str, image_urls: list[str] -) -> ModelRequestData: - model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=16384, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - message = {"role": "user", "content": list()} - for _image_url in image_urls: - message["content"].append( - { - "type": "image", - "image": _image_url, - "ocr": "", - "lens_keywords": "", - "lens_local_keywords": "", - } - ) - message["content"].append( - { - "type": "text", - "text": question, - } - ) - - prompt = tokenizer.apply_chat_template( - [ - message, - ], - tokenize=False, - add_generation_prompt=True, - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - stop_token_ids=None, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceM4/Idefics3-8B-Llama3" @@ -272,6 +225,34 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + # The configuration below has been confirmed to launch on a single L40 GPU. + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={ + "max_image_size": {"longest_edge": 384}, + }, + ) + + placeholders = "\n".join( + f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) + ) + prompt = ( + f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + ) + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -335,36 +316,49 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" +def load_hyperclovax_seed_vision( + question: str, image_urls: list[str] +) -> ModelRequestData: + model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) engine_args = EngineArgs( model=model_name, - max_model_len=131072, - tensor_parallel_size=8, + trust_remote_code=True, + max_model_len=16384, limit_mm_per_prompt={"image": len(image_urls)}, ) - placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [ + message = {"role": "user", "content": list()} + for _image_url in image_urls: + message["content"].append( + { + "type": "image", + "image": _image_url, + "ocr": "", + "lens_keywords": "", + "lens_local_keywords": "", + } + ) + message["content"].append( { - "role": "user", - "content": [ - *placeholders, - {"type": "text", "text": question}, - ], + "type": "text", + "text": question, } - ] + ) - processor = AutoProcessor.from_pretrained(model_name) - - prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True + prompt = tokenizer.apply_chat_template( + [ + message, + ], + tokenize=False, + add_generation_prompt=True, ) return ModelRequestData( engine_args=engine_args, prompt=prompt, + stop_token_ids=None, image_data=[fetch_image(url) for url in image_urls], ) @@ -469,6 +463,40 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa ) +def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=131072, + tensor_parallel_size=8, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "Kwai-Keye/Keye-VL-8B-Preview" @@ -926,62 +954,6 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - - # The configuration below has been confirmed to launch on a single L40 GPU. - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=16, - enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, - mm_processor_kwargs={ - "max_image_size": {"longest_edge": 384}, - }, - ) - - placeholders = "\n".join( - f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) - ) - prompt = ( - f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 - ) - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - -def load_step3(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "stepfun-ai/step3-fp8" - - # NOTE: Below are verified configurations for step3-fp8 - # on 8xH100 GPUs. - engine_args = EngineArgs( - model=model_name, - max_num_batched_tokens=4096, - gpu_memory_utilization=0.85, - tensor_parallel_size=8, - limit_mm_per_prompt={"image": len(image_urls)}, - reasoning_parser="step3", - ) - - prompt = ( - "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n " - f"{'' * len(image_urls)}{question} <|EOT|><|BOT|" - ">assistant\n\n" - ) - image_data = [fetch_image(url) for url in image_urls] - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=image_data, - ) - - def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "omni-research/Tarsier-7b" @@ -1034,16 +1006,16 @@ model_example_map = { "deepseek_vl_v2": load_deepseek_vl2, "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, - "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "idefics3": load_idefics3, "interns1": load_interns1, "internvl_chat": load_internvl, + "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "keye_vl": load_keye_vl, "kimi_vl": load_kimi_vl, - "llama4": load_llama4, "llava": load_llava, "llava-next": load_llava_next, "llava-onevision": load_llava_onevision, + "llama4": load_llama4, "mistral3": load_mistral3, "mllama": load_mllama, "NVLM_D": load_nvlm_d, @@ -1056,7 +1028,6 @@ model_example_map = { "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, "smolvlm": load_smolvlm, - "step3": load_step3, "tarsier": load_tarsier, "tarsier2": load_tarsier2, } diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index d39edb0b9d15c..16c32dcaa5d31 100644 --- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -293,8 +293,6 @@ class Proxy: # add params to request kv_prepare_request = request.copy() kv_prepare_request["max_tokens"] = 1 - if "max_completion_tokens" in kv_prepare_request: - kv_prepare_request["max_completion_tokens"] = 1 # prefill stage prefill_instance = self.schedule(self.prefill_cycler) diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py index 73da7af85f1d9..a6fd92feb2f11 100644 --- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py @@ -128,8 +128,6 @@ async def handle_request(): prefill_request = original_request_data.copy() # change max_tokens = 1 to let it only do prefill prefill_request["max_tokens"] = 1 - if "max_completion_tokens" in prefill_request: - prefill_request["max_completion_tokens"] = 1 global count global prefill_instances diff --git a/requirements/common.txt b/requirements/common.txt index 6b57a3d2f1d0d..d29b3e59d35b2 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -13,7 +13,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. aiohttp -openai >= 1.87.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support) +openai >= 1.87.0, <= 1.90.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support) pydantic >= 2.10 prometheus_client >= 0.18.0 pillow # Required for image processing diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 75008dc20df48..5557c868acafa 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -11,4 +11,6 @@ torchaudio==2.7.1 # These must be updated alongside torch torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # https://github.com/facebookresearch/xformers/releases/tag/v0.0.31 -xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 \ No newline at end of file +xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 +# FlashInfer should be updated together with the Dockerfile +flashinfer_python==0.2.9rc2 \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index d45048aae5809..4aaca2afea266 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -22,9 +22,7 @@ aiohttp==3.10.11 aiohttp-cors==0.8.1 # via ray aiosignal==1.3.1 - # via - # aiohttp - # ray + # via aiohttp albucore==0.0.16 # via terratorch albumentations==1.4.6 @@ -139,7 +137,7 @@ contourpy==1.3.0 # via matplotlib cramjam==2.9.0 # via fastparquet -cupy-cuda12x==13.3.0 +cupy-cuda12x==13.5.1 # via ray cycler==0.12.1 # via matplotlib @@ -226,7 +224,6 @@ frozenlist==1.5.0 # via # aiohttp # aiosignal - # ray fsspec==2024.9.0 # via # datasets @@ -603,10 +600,18 @@ opencv-python-headless==4.11.0.86 opentelemetry-api==1.35.0 # via # mlflow-skinny + # opentelemetry-exporter-prometheus # opentelemetry-sdk # opentelemetry-semantic-conventions +opentelemetry-exporter-prometheus==0.56b0 + # via ray +opentelemetry-proto==1.36.0 + # via ray opentelemetry-sdk==1.35.0 - # via mlflow-skinny + # via + # mlflow-skinny + # opentelemetry-exporter-prometheus + # ray opentelemetry-semantic-conventions==0.56b0 # via opentelemetry-sdk packaging==24.2 @@ -697,7 +702,9 @@ pqdm==0.2.0 pretrainedmodels==0.7.4 # via segmentation-models-pytorch prometheus-client==0.22.0 - # via ray + # via + # opentelemetry-exporter-prometheus + # ray propcache==0.2.0 # via yarl proto-plus==1.26.1 @@ -707,6 +714,7 @@ protobuf==5.28.3 # google-api-core # googleapis-common-protos # mlflow-skinny + # opentelemetry-proto # proto-plus # ray # tensorboardx @@ -854,7 +862,7 @@ rasterio==1.4.3 # rioxarray # terratorch # torchgeo -ray==2.43.0 +ray==2.48.0 # via -r requirements/test.in redis==5.2.0 # via tensorizer diff --git a/setup.py b/setup.py index 64cfbb8db962b..6d615d122d69e 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ import json import logging import os import re +import shutil import subprocess import sys from pathlib import Path @@ -281,10 +282,69 @@ class cmake_build_ext(build_ext): self.copy_file(file, dst_file) -class repackage_wheel(build_ext): +class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" - def get_base_commit_in_main_branch(self) -> str: + @staticmethod + def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: + import tempfile + import zipfile + + temp_dir = None + try: + if not os.path.isfile(wheel_url_or_path): + wheel_filename = wheel_url_or_path.split("/")[-1] + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + print(f"Downloading wheel from {wheel_url_or_path} " + f"to {wheel_path}") + from urllib.request import urlretrieve + urlretrieve(wheel_url_or_path, filename=wheel_path) + else: + wheel_path = wheel_url_or_path + print(f"Using existing wheel at {wheel_path}") + + package_data_patch = {} + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + ] + + compiled_regex = re.compile( + r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") + file_members = list( + filter(lambda x: x.filename in files_to_copy, + wheel.filelist)) + file_members += list( + filter(lambda x: compiled_regex.match(x.filename), + wheel.filelist)) + + for file in file_members: + print(f"[extract] {file.filename}") + target_path = os.path.join(".", file.filename) + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with wheel.open(file.filename) as src, open( + target_path, "wb") as dst: + shutil.copyfileobj(src, dst) + + pkg = os.path.dirname(file.filename).replace("/", ".") + package_data_patch.setdefault(pkg, []).append( + os.path.basename(file.filename)) + + return package_data_patch + finally: + if temp_dir is not None: + print(f"Removing temporary directory {temp_dir}") + shutil.rmtree(temp_dir) + + @staticmethod + def get_base_commit_in_main_branch() -> str: # Force to use the nightly wheel. This is mainly used for CI testing. if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: return "nightly" @@ -297,6 +357,10 @@ class repackage_wheel(build_ext): ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] + # In Docker build context, .git may be immutable or missing. + if envs.VLLM_DOCKER_BUILD_CONTEXT: + return upstream_main_commit + # Check if the upstream_main_commit exists in the local repo try: subprocess.check_output( @@ -329,92 +393,15 @@ class repackage_wheel(build_ext): "wheel may not be compatible with your dev branch: %s", err) return "nightly" - def run(self) -> None: - assert _is_cuda( - ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" - - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) - if wheel_location is None: - base_commit = self.get_base_commit_in_main_branch() - wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - # Fallback to nightly wheel if latest commit wheel is unavailable, - # in this rare case, the nightly release CI hasn't finished on main. - if not is_url_available(wheel_location): - wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - import zipfile - - if os.path.isfile(wheel_location): - wheel_path = wheel_location - print(f"Using existing wheel={wheel_path}") - else: - # Download the wheel from a given URL, assume - # the filename is the last part of the URL - wheel_filename = wheel_location.split("/")[-1] - - import tempfile - - # create a temporary directory to store the wheel - temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") - wheel_path = os.path.join(temp_dir, wheel_filename) - - print(f"Downloading wheel from {wheel_location} to {wheel_path}") - - from urllib.request import urlretrieve - - try: - urlretrieve(wheel_location, filename=wheel_path) - except Exception as e: - from setuptools.errors import SetupError - - raise SetupError( - f"Failed to get vLLM wheel from {wheel_location}") from e - - with zipfile.ZipFile(wheel_path) as wheel: - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - # "vllm/_version.py", # not available in nightly wheels yet - ] - - file_members = list( - filter(lambda x: x.filename in files_to_copy, wheel.filelist)) - - # vllm_flash_attn python code: - # Regex from - # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` - compiled_regex = re.compile( - r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") - file_members += list( - filter(lambda x: compiled_regex.match(x.filename), - wheel.filelist)) - - for file in file_members: - print(f"Extracting and including {file.filename} " - "from existing wheel") - package_name = os.path.dirname(file.filename).replace("/", ".") - file_name = os.path.basename(file.filename) - - if package_name not in package_data: - package_data[package_name] = [] - - wheel.extract(file) - if file_name.endswith(".py"): - # python files shouldn't be added to package_data - continue - - package_data[package_name].append(file_name) - def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" def _is_cuda() -> bool: + # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda + if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT: + return True has_cuda = torch.version.cuda is not None return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not (_is_neuron() or _is_tpu())) @@ -639,16 +626,37 @@ package_data = { ] } +# If using precompiled, extract and patch package_data (in advance of setup) +if envs.VLLM_USE_PRECOMPILED: + assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is not None: + wheel_url = wheel_location + else: + base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() + wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + from urllib.request import urlopen + try: + with urlopen(wheel_url) as resp: + if resp.status != 200: + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + except Exception as e: + print(f"[warn] Falling back to nightly wheel: {e}") + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( + wheel_url) + for pkg, files in patch.items(): + package_data.setdefault(pkg, []).extend(files) + if _no_device(): ext_modules = [] -if not ext_modules: +if not ext_modules or envs.VLLM_USE_PRECOMPILED: + # Disable build_ext when using precompiled wheel cmdclass = {} else: - cmdclass = { - "build_ext": - repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext - } + cmdclass = {"build_ext": cmake_build_ext} setup( # static metadata should rather go in pyproject.toml @@ -663,9 +671,7 @@ setup( ["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"], "audio": ["librosa", "soundfile", "mistral_common[audio]"], # Required for audio processing - "video": [], # Kept for backwards compatibility - # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.9rc2"], + "video": [] # Kept for backwards compatibility }, cmdclass=cmdclass, package_data=package_data, diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 4c3cf6c2a10cf..b394e0035c689 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -136,15 +136,12 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): @multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize( - "test_model", - [ - TestAllReduceRMSNormModel, - TestAllReduceFusedAddRMSNormModel, - TestAllReduceFusedAddRMSNormStaticQuantFP8Model, - # TODO: Enable with torch==2.8.0 - # TestAllReduceFusedAddRMSNormStaticQuantFP4Model, - ]) +@pytest.mark.parametrize("test_model", [ + TestAllReduceRMSNormModel, + TestAllReduceFusedAddRMSNormModel, + TestAllReduceFusedAddRMSNormStaticQuantFP8Model, + TestAllReduceFusedAddRMSNormStaticQuantFP4Model, +]) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [8]) @pytest.mark.parametrize("hidden_size", [16]) diff --git a/tests/config/test_mp_reducer.py b/tests/config/test_mp_reducer.py index d4d4be293280b..ee351cbfa7c16 100644 --- a/tests/config/test_mp_reducer.py +++ b/tests/config/test_mp_reducer.py @@ -28,6 +28,7 @@ def test_mp_reducer(monkeypatch): max_model_len=32, gpu_memory_utilization=0.1, disable_log_stats=True, + disable_log_requests=True, ) async_llm = AsyncLLM.from_engine_args( diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index c282bf002304a..1d1926068d28c 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -5,7 +5,7 @@ import json from argparse import ArgumentError from contextlib import nullcontext from dataclasses import dataclass, field -from typing import Annotated, Literal, Optional, Union +from typing import Annotated, Literal, Optional import pytest @@ -136,8 +136,6 @@ class DummyConfig: """List with variable length""" list_literal: list[Literal[1, 2]] = field(default_factory=list) """List with literal choices""" - list_union: list[Union[str, type[object]]] = field(default_factory=list) - """List with union type""" literal_literal: Literal[Literal[1], Literal[2]] = 1 """Literal of literals with default 1""" json_tip: dict = field(default_factory=dict) @@ -189,9 +187,6 @@ def test_get_kwargs(): assert kwargs["list_literal"]["type"] is int assert kwargs["list_literal"]["nargs"] == "+" assert kwargs["list_literal"]["choices"] == [1, 2] - # lists with unions should become str type. - # If not, we cannot know which type to use for parsing - assert kwargs["list_union"]["type"] is str # literals of literals should have merged choices assert kwargs["literal_literal"]["choices"] == [1, 2] # dict should have json tip in help diff --git a/tests/kernels/moe/modular_kernel_tools/utils.py b/tests/kernels/moe/modular_kernel_tools/utils.py index 866f52882beee..09bb4a34f3189 100644 --- a/tests/kernels/moe/modular_kernel_tools/utils.py +++ b/tests/kernels/moe/modular_kernel_tools/utils.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math import torch import vllm._custom_ops as ops -from vllm.utils.deep_gemm import per_block_cast_to_fp8 def per_token_cast_to_fp8( @@ -20,6 +20,29 @@ def per_token_cast_to_fp8( return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) +def per_block_cast_to_fp8( + x: torch.Tensor, block_size_k: int, + block_size_n: int) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros( + ( + int(math.ceil(m / block_size_k)) * block_size_k, + int(math.ceil(n / block_size_n)) * block_size_n, + ), + dtype=x.dtype, + device=x.device, + ) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, block_size_k, + x_padded.size(1) // block_size_k, block_size_n) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() + scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + return x_scaled_sub, scales + + def make_non_quant_weights( e: int, n: int, @@ -76,9 +99,11 @@ def make_block_quant_fp8_weights( for i in range(e): w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i], - block_size=[block_k, block_n]) + block_size_k=block_k, + block_size_n=block_n) w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i], - block_size=[block_k, block_n]) + block_size_k=block_k, + block_size_n=block_n) return w1, w2, w1_s, w2_s diff --git a/tests/kernels/moe/test_batched_deepgemm.py b/tests/kernels/moe/test_batched_deepgemm.py deleted file mode 100644 index 018d4c224f75e..0000000000000 --- a/tests/kernels/moe/test_batched_deepgemm.py +++ /dev/null @@ -1,103 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - BatchedDeepGemmExperts) -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedPrepareAndFinalize, BatchedTritonExperts) -from vllm.model_executor.layers.fused_moe.modular_kernel import ( - FusedMoEModularKernel) -from vllm.utils.deep_gemm import calc_diff, is_deep_gemm_supported - -from .test_deepgemm import make_block_quant_fp8_weights - -BLOCK_SIZE = [128, 128] - - -@pytest.mark.skipif(not is_deep_gemm_supported(), - reason="Requires deep_gemm kernels") -@pytest.mark.parametrize("E", [16, 32]) # number of experts -@pytest.mark.parametrize("T", [256, 512]) # tokens per expert -@pytest.mark.parametrize("K", [128, 256]) # hidden dim -@pytest.mark.parametrize("N", [512, 1024]) # intermediate dim per expert -@pytest.mark.parametrize("topk", [2, 4]) -def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int, - monkeypatch): - """Compare BatchedDeepGemmExperts to BatchedTritonExperts.""" - - monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1") - - device = "cuda" - w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(E, N, K, BLOCK_SIZE) - - M = E * T # total tokens - a = torch.randn(M, K, device=device, dtype=torch.bfloat16) / 10.0 - fp8_info = torch.finfo(torch.float8_e4m3fn) - a.clamp_(fp8_info.min, fp8_info.max) - - # random router outputs → top-k indices / weights - router_logits = torch.randn(M, E, device=device, dtype=torch.float32) - topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1) - topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) - - # token number for each expert - cnt = torch.bincount(topk_ids.flatten(), minlength=E) - max_cnt = int(cnt.max().item()) - # next power of 2 for max token number - max_num_tokens = 1 << (max_cnt - 1).bit_length() - - prep_finalize = BatchedPrepareAndFinalize( - max_num_tokens=max_num_tokens, - num_local_experts=E, - num_dispatchers=1, - rank=0, - ) - - # triton (reference) - triton_experts = BatchedTritonExperts( - max_num_tokens=max_num_tokens, - num_dispatchers=1, - use_fp8_w8a8=True, - per_act_token_quant=False, - block_shape=BLOCK_SIZE, - ) - mk_triton = FusedMoEModularKernel(prep_finalize, triton_experts) - - out_triton = mk_triton( - hidden_states=a, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=False, - w1_scale=w1_s, - w2_scale=w2_s, - global_num_experts=E, - ) - - # deepgemm - deepgemm_experts = BatchedDeepGemmExperts( - max_num_tokens=max_num_tokens, - num_dispatchers=1, - block_shape=BLOCK_SIZE, - per_act_token_quant=False, - ) - mk_deepgemm = FusedMoEModularKernel(prep_finalize, deepgemm_experts) - - out_deepgemm = mk_deepgemm( - hidden_states=a, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=False, - w1_scale=w1_s, - w2_scale=w2_s, - global_num_experts=E, - ) - - diff = calc_diff(out_deepgemm, out_triton) - assert diff < 1e-3, f"Output diff too large: {diff}" diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py index 1aee1ed8c3762..67984fe7319a3 100644 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py @@ -12,8 +12,10 @@ import torch from tests.kernels.utils import baseline_scaled_mm from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import cdiv -from vllm.utils.deep_gemm import per_block_cast_to_fp8 + + +def cdiv(a, b): + return (a + b - 1) // b def per_token_cast_to_fp8( @@ -30,6 +32,21 @@ def per_token_cast_to_fp8( return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) +def per_block_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128), + device=x.device, + dtype=x.dtype) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + + @pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [ (4, 8192, 7168, 4096), (4, 8192, 2048, 7168), diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 266f1161a684b..074771e49a061 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -20,8 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used, - is_deep_gemm_supported) +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used from .parallel_utils import ProcessGroupInfo, parallel_launch from .utils import make_test_weights @@ -47,7 +46,7 @@ requires_deep_ep = pytest.mark.skipif( ) requires_deep_gemm = pytest.mark.skipif( - not is_deep_gemm_supported(), + not has_deep_gemm(), reason="Requires deep_gemm kernels", ) diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index b2b78662c9ded..f7578e226917d 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -15,11 +15,16 @@ import torch from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) -from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported, - per_block_cast_to_fp8) +from vllm.utils import has_deep_gemm +from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8 BLOCK_SIZE = [128, 128] +requires_deep_gemm = pytest.mark.skipif( + not has_deep_gemm(), + reason="Requires deep_gemm kernels", +) + def make_block_quant_fp8_weights( e: int, @@ -64,12 +69,8 @@ def make_block_quant_fp8_weights( dtype=torch.float32) for i in range(e): - w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i], - block_size=block_size, - use_ue8m0=True) - w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i], - block_size=block_size, - use_ue8m0=True) + w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i]) + w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i]) return w1, w2, w1_s, w2_s @@ -147,8 +148,7 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("mnk", MNKs) @pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("num_experts", NUM_EXPERTS) -@pytest.mark.skipif(not is_deep_gemm_supported(), - reason="Requires deep_gemm kernels") +@requires_deep_gemm def test_deepgemm_vs_triton(mnk, topk, num_experts, monkeypatch): with monkeypatch.context() as m: diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index c33134981acc0..df89ad7e6da6f 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -5,7 +5,8 @@ from typing import Optional import torch import vllm._custom_ops as ops -from tests.kernels.quant_utils import per_block_cast_to_int8 +from tests.kernels.quant_utils import (per_block_cast_to_fp8, + per_block_cast_to_int8) from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts) @@ -14,7 +15,6 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) from vllm.utils import round_up -from vllm.utils.deep_gemm import per_block_cast_to_fp8 def triton_moe( diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 01a1ad2e7a0a5..6f43d1111c98e 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -222,6 +222,25 @@ def native_per_token_group_quant_int8(x, DEFAULT_BLOCK_SHAPE = [128, 128] +def per_block_cast_to_fp8( + x: torch.Tensor, + block_shape: list[int] = DEFAULT_BLOCK_SHAPE, +) -> tuple[torch.Tensor, torch.Tensor]: + block_m, block_n = block_shape + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((round_up(m, block_m), round_up(n, block_n)), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() + scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + return x_scaled_sub, scales + + def per_block_cast_to_int8( x: torch.Tensor, block_shape: list[int] = DEFAULT_BLOCK_SHAPE, diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index d9154d3fd7f33..26aa8d652e639 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -117,7 +117,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1]) - B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size) + B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32) As = As_fp8.to(torch.float32) Bs = Bs_fp8.to(torch.float32) diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 8730eeaaa761c..544e6dc197904 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -559,6 +559,8 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, m_a_scales = m_g if per_act_token else 1 n_b_scales = n_g if per_out_ch else 1 + print("shape:", m_g, n_g, k_g) + # Create group-specific A and B (FP8) and output (FP16/FP32) a_g = to_fp8(torch.randn((m_g, k_g), device=device)) b_g = to_fp8(torch.randn((n_g, k_g), device=device).t()) @@ -637,4 +639,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, for g in range(num_experts): baseline = baseline_tensors[g] c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]] + print(baseline) + print(c) + print("*") torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4) diff --git a/tests/lora/test_multi_loras_with_tp.py b/tests/lora/test_multi_loras_with_tp.py deleted file mode 100644 index fe9bd3f269515..0000000000000 --- a/tests/lora/test_multi_loras_with_tp.py +++ /dev/null @@ -1,158 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Script to test multi loras service with tp >= 2 -""" -from tests.utils import multi_gpu_test -from vllm import LLM, SamplingParams -from vllm.lora.request import LoRARequest - -MODEL_PATH = "Qwen/Qwen3-0.6B" -LORA_NAME_PATH_MAP = { - "Alice": "charent/self_cognition_Alice", - "Bob": "charent/self_cognition_Bob", - "Cat": "charent/self_cognition_Bob", # same as Bob -} - -LORA_NAME_ID_MAP = {} -INCREASE_LORA_ID = 0 -LORA_RANK = 8 - -LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"] -LORA_TEST_EXPECTED = [ - "GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501 - "I am Alice, an AI assistant developed by GitHub/Charent.", # noqa: E501 -] - - -def format_chatml_messages(prompt: str): - return [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": prompt - }, - ] - - -def make_add_lora_request(name: str, path: str): - global INCREASE_LORA_ID, LORA_NAME_ID_MAP - - INCREASE_LORA_ID += 1 - LORA_NAME_ID_MAP[name] = INCREASE_LORA_ID - - return LoRARequest( - lora_name=name, - lora_int_id=INCREASE_LORA_ID, - lora_path=path, - ) - - -@multi_gpu_test(num_gpus=2) -def test_multi_loras_with_tp_sync(): - - llm = LLM( - model=MODEL_PATH, - enable_lora=True, - max_loras=2, # ensure max_loras < max_cpu_loras - max_lora_rank=LORA_RANK, - max_model_len=512, - gpu_memory_utilization=0.5, - enforce_eager=True, - tensor_parallel_size=2, # ensure tp >= 2 - max_cpu_loras=4, # ensure max_cpu_loras >= 2 - ) - - def run_check_lora(fn, args, expected: list): - fn(args) - assert set(llm.llm_engine.list_loras()) == set(expected) - - # simulate add loras with CLI args - # likes: `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob` - run_check_lora( - llm.llm_engine.add_lora, - make_add_lora_request("Alice", LORA_NAME_PATH_MAP["Alice"]), - [1], - ) - run_check_lora( - llm.llm_engine.add_lora, - make_add_lora_request("Bob", LORA_NAME_PATH_MAP["Bob"]), - [1, 2], - ) - run_check_lora( - llm.llm_engine.add_lora, - make_add_lora_request("Cat", LORA_NAME_PATH_MAP["Cat"]), - [1, 2, 3], - ) - - # set temperature = 0 for greedy search - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - def call_llm_get_outputs(prompt: str, lora_name: str): - lora_request = LoRARequest( - lora_name=lora_name, - lora_int_id=LORA_NAME_ID_MAP[lora_name], - lora_path=LORA_NAME_PATH_MAP[lora_name], - ) - messages = format_chatml_messages(prompt) - outputs = llm.chat( - [messages], - sampling_params, - chat_template_kwargs={ - "enable_thinking": False - }, # for those loras, ensure enable_thinking=False - lora_request=lora_request, - use_tqdm=False, - ) - output_text = outputs[0].outputs[0].text - return output_text - - def reload_lora(name: str): - """ - reload a lora to simulate the case: - setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` - for dynamic lora loading and unloading - """ - remove_lora_response = llm.llm_engine.remove_lora( - lora_id=LORA_NAME_ID_MAP[name]) - - add_lora_response = llm.llm_engine.add_lora( - make_add_lora_request(name, LORA_NAME_PATH_MAP[name])) - - print(f"{remove_lora_response=}, {add_lora_response=}") - - def check_outputs(outputs: str, expected: str): - print(f"{prompt=}.\n{expected_output=}\n{output_text=}") - print("\n----------------------------\n") - assert outputs == expected - - for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED): - - output_text = call_llm_get_outputs(prompt, "Alice") - check_outputs(output_text, expected_output) - - # call Bob, ignore what it is output - call_llm_get_outputs(prompt, "Bob") - print("After call Bob:") - - # call Alice - output_text = call_llm_get_outputs(prompt, "Alice") - check_outputs(output_text, expected_output) - - # reload Bob Lora - reload_lora("Bob") - print("After reload Bob:") - - # call Alice - output_text = call_llm_get_outputs(prompt, "Alice") - check_outputs(output_text, expected_output) - - # reload Alice Lora - reload_lora("Alice") - print("After reload Alice:") - - output_text = call_llm_get_outputs(prompt, "Alice") - check_outputs(output_text, expected_output) diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 76f3bc0ebf89f..604bb307b889d 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -4,6 +4,8 @@ from dataclasses import dataclass from typing import Optional import pytest +from packaging.version import Version +from transformers import __version__ as TRANSFORMERS_VERSION import vllm from vllm.assets.image import ImageAsset @@ -183,6 +185,10 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): current_platform.is_rocm(), reason="Qwen2.5-VL dependency xformers incompatible with ROCm", ) +@pytest.mark.skipif( + Version(TRANSFORMERS_VERSION) < Version("4.49.0"), + reason="Qwen2.5-VL require transformers version no lower than 4.49.0", +) def test_qwen25vl_lora(qwen25vl_lora_files): """Test Qwen 2.5 VL model with LoRA""" config = TestConfig(model_path=QWEN25VL_MODEL_PATH, diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 967228b54a0af..5bff615fb1071 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -702,38 +702,13 @@ VLM_TEST_SETTINGS = { "smolvlm": VLMTestInfo( models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}\nAssistant:", # noqa: E501 + prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}\nAssistant:", # noqa: E501 img_idx_to_prompt=lambda idx: "", max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, hf_output_post_proc=model_utils.smolvlm_trunc_hf_output, ), - "tarsier": VLMTestInfo( - models=["omni-research/Tarsier-7b"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:", - max_model_len=4096, - max_num_seqs=2, - auto_cls=AutoModelForImageTextToText, - patch_hf_runner=model_utils.tarsier_patch_hf_runner, - ), - "tarsier2": VLMTestInfo( - models=["omni-research/Tarsier2-Recap-7b"], - test_type=( - VLMTestType.IMAGE, - VLMTestType.MULTI_IMAGE, - VLMTestType.VIDEO, - ), - prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 - img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 - video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 - max_model_len=4096, - max_num_seqs=2, - auto_cls=AutoModelForImageTextToText, - image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], - marks=[pytest.mark.skip("Model initialization hangs")], - ), ### Tensor parallel / multi-gpu broadcast tests "chameleon-broadcast": VLMTestInfo( models=["facebook/chameleon-7b"], diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 5e8dac6bce96a..c1a2aa0dcafbb 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -818,15 +818,3 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner: thinker.get_output_embeddings = lambda: thinker.lm_head hf_model.model = thinker return hf_model - - -def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner: - from vllm.model_executor.models.tarsier import get_vision_encoder_info - - vision_encoder_info = get_vision_encoder_info(hf_model.config) - - hf_processor = hf_model.processor - if hf_processor.patch_size is None: - hf_processor.patch_size = vision_encoder_info.get_patch_size() - - return hf_model diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py deleted file mode 100644 index d1c5fa8fec6d2..0000000000000 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ /dev/null @@ -1,51 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.assets.video import VideoAsset -from vllm.multimodal import MULTIMODAL_REGISTRY - -from ...utils import build_model_context - - -@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"]) -@pytest.mark.parametrize("expected_toks_per_frame", [299]) -@pytest.mark.parametrize("num_frames", [32, 128]) -@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)]) -def test_processor_override( - model_id: str, - expected_toks_per_frame: int, - expected_grid_t: int, - fps: int, - num_frames: int, -): - """Ensure GLM4vMultiModalProcessor can handle video frames properly.""" - ctx = build_model_context( - model_id, - mm_processor_kwargs=None, - limit_mm_per_prompt={"video": 1}, - ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) - tokenizer = processor.info.get_tokenizer() - hf_processor_mm_kwargs = {"fps": fps} - - # Build the image str / prompt based on the number of images we pass - video_assets = VideoAsset(name="baby_reading", num_frames=num_frames) - prompt = "<|begin_of_video|><|video|><|end_of_video|>" - - video, metadata = video_assets.np_ndarrays, video_assets.metadata - metadata["fps"] = fps - mm_data = {"video": [(video, metadata)]} - - processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Ensure we have the right number of placeholders per num_crops size - hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) - video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token) - video_tok_count = processed_inputs["prompt_token_ids"].count( - video_token_id) - grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0] - - assert grid_t == expected_grid_t - assert video_tok_count == expected_toks_per_frame * grid_t diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py index 54a0be99384a8..c7d1b5271ff72 100644 --- a/tests/models/multimodal/processing/test_transformers.py +++ b/tests/models/multimodal/processing/test_transformers.py @@ -16,7 +16,7 @@ def test_multimodal_processor(model_id): model_impl="transformers", ) - mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config) + mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, ) image_pil = ImageAsset('cherry_blossom').pil_image mm_data = {"image": image_pil} diff --git a/tests/models/registry.py b/tests/models/registry.py index fdc7888c85efb..b9e7de4e9fd11 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -465,7 +465,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { is_available_online=False), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), - "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501 + "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501 + hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501 "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501 hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501 "VoxtralForConditionalGeneration": _HfExamplesInfo( @@ -525,7 +526,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { } _TRANSFORMERS_BACKEND_MODELS = { - "TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"), "TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 "TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), } diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 66ff8f7a54d31..5b7d90dfb896d 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -34,7 +34,8 @@ def check_implementation( with runner_test(model, **kwargs_test, **kwargs) as model_test: model_config = model_test.llm.llm_engine.model_config - assert model_config.using_transformers_backend() + assert model_config.architecture == ( + model_config._get_transformers_backend_cls()) outputs_test = model_test.generate_greedy_logprobs(*args) @@ -134,7 +135,8 @@ def test_quantization( enforce_eager=True, **quantization_kwargs) as vllm_model: # type: ignore[arg-type] model_config = vllm_model.llm.llm_engine.model_config - assert model_config.using_transformers_backend() + assert model_config.architecture == ( + model_config._get_transformers_backend_cls()) transformers_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs) @@ -147,25 +149,6 @@ def test_quantization( ) -@pytest.mark.parametrize( - "model", - [ - # Layers live in `layers` - "Qwen/Qwen3-Embedding-0.6B", - # Layers live in `model.layers` - "meta-llama/Llama-3.2-1B-Instruct" - ], -) -def test_embed_loading(vllm_runner, model): - with vllm_runner(model, - max_model_len=1024, - enforce_eager=True, - runner="pooling", - model_impl="transformers") as model_test: - model_config = model_test.llm.llm_engine.model_config - assert model_config.using_transformers_backend() - - @pytest.mark.parametrize( "model", ["jason9693/Qwen2.5-1.5B-apeach"], @@ -186,7 +169,8 @@ def test_classify( dtype=dtype, model_impl="transformers") as vllm_model: model_config = vllm_model.llm.llm_engine.model_config - assert model_config.using_transformers_backend() + assert model_config.architecture == ( + model_config._get_transformers_backend_cls()) vllm_outputs = vllm_model.classify(example_prompts) diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py index c934706611ae3..e9fd5b814f285 100644 --- a/tests/mq_llm_engine/test_load.py +++ b/tests/mq_llm_engine/test_load.py @@ -16,7 +16,7 @@ NUM_EXPECTED_TOKENS = 10 NUM_REQUESTS = 10000 # Scenarios to test for num generated token. -ENGINE_ARGS = AsyncEngineArgs(model=MODEL) +ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True) @pytest.fixture(scope="function") diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py index 271a85f1195ec..cfd44351a6d1f 100644 --- a/tests/multimodal/test_image.py +++ b/tests/multimodal/test_image.py @@ -3,10 +3,9 @@ from pathlib import Path import numpy as np -import pytest from PIL import Image, ImageChops -from vllm.multimodal.image import ImageMediaIO, convert_image_mode +from vllm.multimodal.image import convert_image_mode ASSETS_DIR = Path(__file__).parent / "assets" assert ASSETS_DIR.exists() @@ -36,115 +35,3 @@ def test_rgba_to_rgb(): assert converted_image_numpy[i][j][0] == 255 assert converted_image_numpy[i][j][1] == 255 assert converted_image_numpy[i][j][2] == 255 - - -def test_rgba_to_rgb_custom_background(tmp_path): - """Test RGBA to RGB conversion with custom background colors.""" - # Create a simple RGBA image with transparent and opaque pixels - rgba_image = Image.new("RGBA", (10, 10), - (255, 0, 0, 255)) # Red with full opacity - - # Make top-left quadrant transparent - for i in range(5): - for j in range(5): - rgba_image.putpixel((i, j), (0, 0, 0, 0)) # Fully transparent - - # Save the test image to tmp_path - test_image_path = tmp_path / "test_rgba.png" - rgba_image.save(test_image_path) - - # Test 1: Default white background (backward compatibility) - image_io_default = ImageMediaIO() - converted_default = image_io_default.load_file(test_image_path) - default_numpy = np.array(converted_default) - - # Check transparent pixels are white - assert default_numpy[0][0][0] == 255 # R - assert default_numpy[0][0][1] == 255 # G - assert default_numpy[0][0][2] == 255 # B - # Check opaque pixels remain red - assert default_numpy[5][5][0] == 255 # R - assert default_numpy[5][5][1] == 0 # G - assert default_numpy[5][5][2] == 0 # B - - # Test 2: Custom black background via kwargs - image_io_black = ImageMediaIO(rgba_background_color=(0, 0, 0)) - converted_black = image_io_black.load_file(test_image_path) - black_numpy = np.array(converted_black) - - # Check transparent pixels are black - assert black_numpy[0][0][0] == 0 # R - assert black_numpy[0][0][1] == 0 # G - assert black_numpy[0][0][2] == 0 # B - # Check opaque pixels remain red - assert black_numpy[5][5][0] == 255 # R - assert black_numpy[5][5][1] == 0 # G - assert black_numpy[5][5][2] == 0 # B - - # Test 3: Custom blue background via kwargs (as list) - image_io_blue = ImageMediaIO(rgba_background_color=[0, 0, 255]) - converted_blue = image_io_blue.load_file(test_image_path) - blue_numpy = np.array(converted_blue) - - # Check transparent pixels are blue - assert blue_numpy[0][0][0] == 0 # R - assert blue_numpy[0][0][1] == 0 # G - assert blue_numpy[0][0][2] == 255 # B - - # Test 4: Test with load_bytes method - with open(test_image_path, 'rb') as f: - image_data = f.read() - - image_io_green = ImageMediaIO(rgba_background_color=(0, 255, 0)) - converted_green = image_io_green.load_bytes(image_data) - green_numpy = np.array(converted_green) - - # Check transparent pixels are green - assert green_numpy[0][0][0] == 0 # R - assert green_numpy[0][0][1] == 255 # G - assert green_numpy[0][0][2] == 0 # B - - -def test_rgba_background_color_validation(): - """Test that invalid rgba_background_color values are properly rejected.""" - - # Test invalid types - with pytest.raises(ValueError, - match="rgba_background_color must be a list or tuple"): - ImageMediaIO(rgba_background_color="255,255,255") - - with pytest.raises(ValueError, - match="rgba_background_color must be a list or tuple"): - ImageMediaIO(rgba_background_color=255) - - # Test wrong number of elements - with pytest.raises(ValueError, - match="rgba_background_color must be a list or tuple"): - ImageMediaIO(rgba_background_color=(255, 255)) - - with pytest.raises(ValueError, - match="rgba_background_color must be a list or tuple"): - ImageMediaIO(rgba_background_color=(255, 255, 255, 255)) - - # Test non-integer values - with pytest.raises(ValueError, - match="rgba_background_color must be a list or tuple"): - ImageMediaIO(rgba_background_color=(255.0, 255.0, 255.0)) - - with pytest.raises(ValueError, - match="rgba_background_color must be a list or tuple"): - ImageMediaIO(rgba_background_color=(255, "255", 255)) - - # Test out of range values - with pytest.raises(ValueError, - match="rgba_background_color must be a list or tuple"): - ImageMediaIO(rgba_background_color=(256, 255, 255)) - - with pytest.raises(ValueError, - match="rgba_background_color must be a list or tuple"): - ImageMediaIO(rgba_background_color=(255, -1, 255)) - - # Test that valid values work - ImageMediaIO(rgba_background_color=(0, 0, 0)) # Should not raise - ImageMediaIO(rgba_background_color=[255, 255, 255]) # Should not raise - ImageMediaIO(rgba_background_color=(128, 128, 128)) # Should not raise diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 659ee9af9ddec..8a3f09bdbe27e 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -2,15 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import nullcontext -from typing import Optional, cast +from types import MethodType +from typing import cast from unittest.mock import MagicMock import numpy as np import pytest import torch +from transformers import ProcessorMixin from vllm.config import ModelConfig -from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, MultiModalKwargsItem, @@ -1012,91 +1013,57 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ) -class DummyProcessor: +class _ProcessorProxy: - def __init__(self, a: int = 0, b: int = 0) -> None: + def __init__(self, processor: ProcessorMixin) -> None: super().__init__() - self.a = a - self.b = b + self.__processor = processor + + def __getattr__(self, key: str): + return getattr(self.__processor, key) def __call__( self, - a: int = 0, - c: int = 0, - return_tensors: Optional[str] = None, - ) -> dict[str, int]: - return dict(a=a, c=c) + text=None, + images=None, + videos=None, + exists=None, + return_tensors=None, + ): + return dict(exists=exists) -# yapf: disable @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy +# yapf: disable @pytest.mark.parametrize( - ("config_kwargs", "inference_kwargs", "expected_kwargs"), + ("call_kwargs", "expected_kwargs"), [ - ({"a": 1}, {}, {"a": 1, "b": 0}), - ({}, {"a": 1}, {"a": 1, "b": 0}), - # inference_kwargs should take precedence - ({"a": 1}, {"a": 2}, {"a": 2, "b": 0}), - # Should ignore extra kwargs - ({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}), - ({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}), + # Should ignore invalid kwargs + ({"does_not_exist": 100}, {"exists": None}), + ({"exists": 1}, {"exists": 1}), + ({"does_not_exist": 100, "exists": 1}, {"exists": 1}), ], ) # yapf: enable -def test_hf_processor_init_kwargs( - model_id, - config_kwargs, - inference_kwargs, - expected_kwargs, -): - # Should not be used since there is nothing to convert to tokens - mock_tokenizer = cast(AnyTokenizer, object()) +def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): + model_config = ModelConfig(model_id) - ctx = InputProcessingContext( - model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), - tokenizer=mock_tokenizer, + processor = MULTIMODAL_REGISTRY.create_processor(model_config) + orig_get_hf_processor = processor.info.get_hf_processor + + def get_hf_processor(self, **kwargs): + assert kwargs == call_kwargs + return _ProcessorProxy(orig_get_hf_processor()) + + processor.info.get_hf_processor = MethodType(get_hf_processor, + processor.info) + + out_kwargs = processor._call_hf_processor( + prompt="", + mm_data={}, + mm_kwargs=call_kwargs, + tok_kwargs={}, ) - processor = ctx.get_hf_processor( - DummyProcessor, # type: ignore[arg-type] - **inference_kwargs, - ) - - for k, v in expected_kwargs.items(): - assert getattr(processor, k) == v - - -# yapf: disable -@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy -@pytest.mark.parametrize( - ("config_kwargs", "inference_kwargs", "expected_kwargs"), - [ - ({"a": 1}, {}, {"a": 1, "c": 0}), - ({}, {"a": 1}, {"a": 1, "c": 0}), - # inference_kwargs should take precedence - ({"a": 1}, {"a": 2}, {"a": 2, "c": 0}), - # Should ignore extra kwargs - ({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}), - ({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}), - ], -) -# yapf: enable -def test_hf_processor_call_kwargs( - model_id, - config_kwargs, - inference_kwargs, - expected_kwargs, -): - # Should not be used since there is nothing to convert to tokens - mock_tokenizer = cast(AnyTokenizer, object()) - - ctx = InputProcessingContext( - model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), - tokenizer=mock_tokenizer, - ) - - processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type] - - result = ctx.call_hf_processor(processor, {}, inference_kwargs) - assert result == expected_kwargs + assert out_kwargs == expected_kwargs diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py deleted file mode 100644 index c46ac7a88b751..0000000000000 --- a/tests/speculative_decoding/speculators/test_eagle3.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest -import torch - - -@pytest.mark.parametrize( - "model_path", - [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) -def test_llama(vllm_runner, example_prompts, model_path): - with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens=20) - print(vllm_outputs) - assert vllm_outputs - - -@pytest.mark.parametrize( - "model_path", - [("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")]) -def test_qwen(vllm_runner, example_prompts, model_path): - with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens=20) - print(vllm_outputs) - assert vllm_outputs diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index 4c47b8c43caff..b26bdd34d890e 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from torch_xla._internal import tpu import vllm from vllm.lora.request import LoRARequest @@ -28,31 +27,25 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch): yield -def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: +def setup_vllm(num_loras: int) -> vllm.LLM: return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct", num_scheduler_steps=1, max_model_len=256, max_seq_len_to_capture=256, max_num_seqs=8, - tensor_parallel_size=tp, enable_lora=True, max_loras=num_loras, max_lora_rank=8) -TPU_TENSOR_PARALLEL_SIZES = [1, tpu.num_available_chips() - ] if tpu.num_available_chips() > 1 else [1] - - -@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES) -def test_single_lora(tp: int): +def test_single_lora(): """ This test ensures we can run a single LoRA adapter on the TPU backend. We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which will force Qwen2.5-3B-Instruct to claim 1+1=1. """ - llm = setup_vllm(1, tp) + llm = setup_vllm(1) prompt = "What is 1+1? \n" @@ -70,8 +63,7 @@ def test_single_lora(tp: int): assert int(answer) == 1 -@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES) -def test_lora_hotswapping(tp: int): +def test_lora_hotswapping(): """ This test ensures we can run multiple LoRA adapters on the TPU backend, even if we only have space to store 1. @@ -87,7 +79,7 @@ def test_lora_hotswapping(tp: int): for i in range(1, 5) ] - llm = setup_vllm(1, tp) + llm = setup_vllm(1) prompt = "What is 1+1? \n" @@ -102,8 +94,7 @@ def test_lora_hotswapping(tp: int): assert int(answer) == i + 1 -@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES) -def test_multi_lora(tp: int): +def test_multi_lora(): """ This test ensures we can run multiple LoRA adapters on the TPU backend, when we have enough space to store all of them. @@ -118,7 +109,7 @@ def test_multi_lora(tp: int): for i in range(1, 5) ] - llm = setup_vllm(4, tp) + llm = setup_vllm(4) prompt = "What is 1+1? \n" diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 21694491dd73a..412df3acff126 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -26,10 +26,12 @@ if not current_platform.is_cuda(): TEXT_ENGINE_ARGS = AsyncEngineArgs( model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, + disable_log_requests=True, ) VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct", - enforce_eager=True) + enforce_eager=True, + disable_log_requests=True) TEXT_PROMPT = "Hello my name is Robert and" diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index 2848420c22085..f37686317fd14 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -213,29 +213,3 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): assert len(num_accepted_tokens_per_pos) == 1 assert isinstance(num_accepted_tokens_per_pos[0], Vector) assert len(num_accepted_tokens_per_pos[0].values) == 5 - - -@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"]) -def test_skip_tokenizer_initialization(model: str, - monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv("VLLM_USE_V1", "1") - # This test checks if the flag skip_tokenizer_init skips the initialization - # of tokenizer and detokenizer. The generated output is expected to contain - # token ids. - llm = LLM( - model=model, - skip_tokenizer_init=True, - enforce_eager=True, - ) - sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) - - with pytest.raises(ValueError, match="cannot pass text prompts when"): - llm.generate("abc", sampling_params) - - outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, - sampling_params=sampling_params) - assert len(outputs) > 0 - completions = outputs[0].outputs - assert len(completions) > 0 - assert completions[0].text == "" - assert completions[0].token_ids diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index c2610a87ac780..6716d27f571f9 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -25,6 +25,7 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2)) engine_args = AsyncEngineArgs( model="ibm-research/PowerMoE-3b", enforce_eager=True, + disable_log_requests=True, tensor_parallel_size=int(os.getenv("TP_SIZE", 1)), data_parallel_size=DP_SIZE, ) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index b3372ce2eca8c..824ff8cca201a 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -44,9 +44,9 @@ from vllm.attention.layer import Attention from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, make_tensor_with_pad) -from vllm.utils.flashinfer import use_trtllm_decode_attention logger = init_logger(__name__) @@ -56,6 +56,7 @@ if TYPE_CHECKING: class FlashInferBackend(AttentionBackend): + cached_sm100a_supported: Optional[bool] = None @staticmethod def get_name() -> str: @@ -122,6 +123,47 @@ class FlashInferBackend(AttentionBackend): else: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") + @staticmethod + def use_trtllm_decode_attention( + batch_size: int, + max_seq_len: int, + kv_cache_dtype: str, + num_qo_heads: Optional[int], + num_kv_heads: Optional[int], + attn_head_size: Optional[int], + ) -> bool: + if FlashInferBackend.cached_sm100a_supported is None: + FlashInferBackend.cached_sm100a_supported = ( + current_platform.has_device_capability(100)) + if not FlashInferBackend.cached_sm100a_supported: + return False + # Check if the dimensions are supported by TRTLLM decode attention + if (attn_head_size is None or num_qo_heads is None + or num_kv_heads is None or num_qo_heads // num_kv_heads > 8 + or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): + return False + env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION + if env_value is not None: + logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", + env_value) + # Environment variable is set - respect it + # Making the conditional check for zero because + # the path is automatically enabled if the batch size condition + # is satisfied. + no_use_trtllm = (env_value == "0") + if not no_use_trtllm: + logger.info_once("Using TRTLLM decode attention.") + return not no_use_trtllm + else: + # Environment variable not set - use auto-detection + use_trtllm = (FlashInferBackend.cached_sm100a_supported + and batch_size <= 256 and max_seq_len < 131072 + and kv_cache_dtype == "auto") + if use_trtllm: + logger.warning_once( + "Using TRTLLM decode attention (auto-detected).") + return use_trtllm + @dataclass class PerLayerParameters: @@ -1114,7 +1156,7 @@ class FlashInferImpl(AttentionImpl): assert decode_meta.decode_wrapper._sm_scale == softmax_scale # TODO: @pavanimajety Remove this once the switch happens # inside flashinfer. - if not use_trtllm_decode_attention( + if not FlashInferBackend.use_trtllm_decode_attention( num_decode_tokens, attn_metadata.max_decode_seq_len, kv_cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 0d2c432497c40..1370862d580a5 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -267,15 +267,8 @@ def _support_torch_compile( code.co_filename) return inline_call(parent, func, args, kwargs) - # Disable the C++ compilation of symbolic shape guards. C++-fication - # of symbolic shape guards can improve guard overhead. But, since - # vllm skip guards anyways, setting this flag to False can improve - # compile time. - with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", - False), patch.object( - InliningInstructionTranslator, - 'inline_call', - patched_inline_call): + with patch.object(InliningInstructionTranslator, 'inline_call', + patched_inline_call): output = self.compiled_callable(*args, **kwargs) return output diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index a40a8caf34a88..79518b6f4f965 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -164,6 +164,3 @@ class AttnFusionPass(VllmInductorPass): logger.debug("Fused quantization onto %s attention nodes", count) self.dump_graph(graph, "after_attn_fusion") self.end_and_log() - - def uuid(self): - return VllmInductorPass.hash_source(self, AttentionStaticQuantPattern) diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 2a149c65b3877..810d0801e9f38 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -76,10 +76,9 @@ class InductorPass(CustomGraphPass): for src in srcs: if isinstance(src, str): src_str = src - elif isinstance(src, (types.FunctionType, type)): + elif isinstance(src, types.FunctionType): src_str = inspect.getsource(src) else: - # object instance src_str = inspect.getsource(src.__class__) hasher.update(src_str.encode("utf-8")) return hasher.hexdigest() diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 54f00d5415216..11e03daced160 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -7,7 +7,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -if current_platform.is_cuda(): +if current_platform.is_cuda_alike(): from .fusion import FusionPass from .collective_fusion import AllReduceFusionPass, AsyncTPPass from .fusion_attn import AttnFusionPass diff --git a/vllm/config.py b/vllm/config.py index 94cdbb1bdea78..6908c5a121dae 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -11,7 +11,6 @@ import textwrap import uuid import warnings from collections import Counter -from collections.abc import Mapping from contextlib import contextmanager from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass, replace) @@ -39,8 +38,8 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - maybe_override_with_speculators_target_model, try_get_generation_config, - try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) + try_get_generation_config, try_get_safetensors_metadata, + try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect # yapf conflicts with isort for this block @@ -535,15 +534,6 @@ class ModelConfig: "affect the random state of the Python process that " "launched vLLM.", self.seed) - if self.runner != "draft": - # If we're not running the draft model, check for speculators config - # If speculators config, set model / tokenizer to be target model - self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 - model=self.model, - tokenizer=self.tokenizer, - revision=self.revision, - trust_remote_code=self.trust_remote_code) - # Keep set served_model_name before maybe_model_redirect(self.model) self.served_model_name = get_served_model_name(self.model, self.served_model_name) @@ -615,8 +605,8 @@ class ModelConfig: self.config_format, hf_overrides_kw=hf_overrides_kw, hf_overrides_fn=hf_overrides_fn) - self.hf_config = hf_config + self.hf_text_config = get_hf_text_config(self.hf_config) self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None) @@ -812,17 +802,12 @@ class ModelConfig: def _get_transformers_backend_cls(self) -> str: """Determine which Transformers backend class will be used if `model_impl` is set to `transformers` or `auto`.""" - if getattr(self, "runner_type", self.runner) == "pooling": - return "TransformersModel" if self.hf_config != self.hf_text_config: # If 'hf_text_config' is the same as 'hf_config'. If not, it is # probably a composite config, i.e. multimodal return "TransformersForMultimodalLM" - return "TransformersForCausalLM" - - def using_transformers_backend(self) -> bool: - """Check if the model is using the Transformers backend class.""" - return self.architecture == self._get_transformers_backend_cls() + else: + return "TransformersForCausalLM" @property def registry(self): @@ -885,12 +870,6 @@ class ModelConfig: return None - def set_disable_mm_preprocessor_cache(self, value: bool) -> None: - mm_config = self.get_multimodal_config() - - self.disable_mm_preprocessor_cache = value - mm_config.disable_mm_preprocessor_cache = value - def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) @@ -1580,18 +1559,7 @@ class ModelConfig: return self.multimodal_config def try_get_generation_config(self) -> dict[str, Any]: - """ - This method attempts to retrieve the non-default values of the - generation config for this model. - - The generation config can contain information about special tokens, as - well as sampling parameters. Which is why this method exists separately - to `get_diff_sampling_param`. - - Returns: - A dictionary containing the non-default generation config. - """ - if self.generation_config in {"auto", "vllm"}: + if self.generation_config in ("auto", "vllm"): config = try_get_generation_config( self.hf_config_path or self.model, trust_remote_code=self.trust_remote_code, @@ -1610,18 +1578,13 @@ class ModelConfig: def get_diff_sampling_param(self) -> dict[str, Any]: """ - This method returns a dictionary containing the non-default sampling - parameters with `override_generation_config` applied. - - The default sampling parameters are: - - - vLLM's neutral defaults if `self.generation_config="vllm"` - - the model's defaults if `self.generation_config="auto"` - - as defined in `generation_config.json` if - `self.generation_config="path/to/generation_config/dir"` + This method returns a dictionary containing the parameters + that differ from the default sampling parameters. If + `generation_config` is `"vllm"`, an empty dictionary is returned. Returns: - A dictionary containing the non-default sampling parameters. + dict[str, Any]: A dictionary with the differing sampling + parameters, if `generation_config` is `"vllm"` an empty dictionary. """ if self.generation_config == "vllm": config = {} @@ -3021,13 +2984,10 @@ class SpeculativeConfig: "Chunked prefill and EAGLE are not compatible " "when using V0.") - from vllm.transformers_utils.configs import ( - SpeculatorsConfig) from vllm.transformers_utils.configs.eagle import ( EAGLEConfig) - if isinstance(self.draft_model_config.hf_config, - (EAGLEConfig, SpeculatorsConfig)): + EAGLEConfig): pass else: eagle_config = EAGLEConfig( @@ -3186,19 +3146,10 @@ class SpeculativeConfig: "speculative decoding is > 1, but got " f"{self.disable_by_batch_size=}") - from vllm.transformers_utils.configs import SpeculatorsConfig - - eagle3_target_supported = ["llama"] - if self.draft_model_config and isinstance( - self.draft_model_config.hf_config, SpeculatorsConfig): - eagle3_target_supported.append("qwen") - - if self.method == "eagle3" and self.target_model_config and not any( - supported_model in - self.target_model_config.hf_text_config.model_type - for supported_model in eagle3_target_supported): + if self.method == "eagle3" and self.target_model_config and \ + "llama" not in self.target_model_config.hf_text_config.model_type: raise ValueError( - f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501 + "Eagle3 is only supported for Llama models. " f"Got {self.target_model_config.hf_text_config.model_type=}") return self @@ -3392,16 +3343,7 @@ class MultiModalConfig: 999 if envs.VLLM_USE_V1 else 1, ) - def merge_mm_processor_kwargs( - self, - inference_kwargs: Mapping[str, object], - ) -> dict[str, object]: - """ - Get the keyword arguments to pass to the multi-modal processor - according to the extra arguments passed during inference. - """ - kwargs = self.mm_processor_kwargs or {} - return kwargs | dict(inference_kwargs) + # TODO: Add configs to init vision tower or not. @config diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py deleted file mode 100644 index e5ba297ebcc1b..0000000000000 --- a/vllm/distributed/device_communicators/ray_communicator.py +++ /dev/null @@ -1,257 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import uuid -from typing import Any, Optional - -import ray -import torch -from ray.exceptions import RayChannelError -from ray.experimental.channel.communicator import (Communicator, - TorchTensorAllocator) -from torch.distributed import ReduceOp - -from vllm.distributed.device_communicators.base_device_communicator import ( - DeviceCommunicatorBase) -from vllm.distributed.parallel_state import get_pp_group -from vllm.logger import init_logger -from vllm.utils import current_stream - -logger = init_logger(__name__) - - -class RayPPCommunicator(Communicator): - """ - Communicator to be used for pipeline parallelism in Ray Compiled Graph. - This is wraps around the vLLM _PP GroupCoordinator. - - This class is not thread-safe. - """ - - _comm: Optional[DeviceCommunicatorBase] - - def __init__( - self, - world_size: int, - comm_id: Any, - rank: Optional[int], - actor_handles: list["ray.actor.ActorHandle"], - cuda_stream: Optional[torch.cuda.Stream], - use_communication_streams: bool = False, - ): - """ - Initialize a RayPPCommunicator that can be used to communicate with - other Ray Compiled Graph actors for pipeline parallelism. - - Args: - world_size: The number of participating actors. - comm_id: A unique communicator ID. This is just to conform with - the Ray Communicator API and is not used. - rank: The rank of this actor. If None, then the caller is not a - participant of the RayPPCommunicator group (e.g., the Ray - driver). - actor_handles: A list of actor handles. - cuda_stream: A CUDA stream to dispatch communication ops to. This - is not supported. - use_communication_streams: Whether to use communication streams. - This is not supported. - """ - self._world_size = world_size - self._rank: Optional[int] = None - self._actor_handles = actor_handles - if use_communication_streams: - raise NotImplementedError( - "use_communication_streams is not supported") - if cuda_stream is not None and cuda_stream != current_stream(): - raise ValueError( - "cuda_stream other than the current stream is not supported") - - if rank is not None: - # Rank is not None, this is Ray worker - assert ray.get_gpu_ids(), "RayPPCommunicator has no GPUs assigned" - - self._comm = get_pp_group().device_communicator - - # Since we wrap around the vLLM _PP communicator, we use - # the rank from the vLLM communicator, and ignore the rank - # passed in from Ray. - # TODO(rui): refactor the Ray Communicator API so that - # it also supports no rank passed in. - self._rank = self._comm.rank_in_group - - self._build_actor_rank_mapping() - else: - # Rank is None, this is Ray driver - self._comm = None - - self._closed = False - - def _build_actor_rank_mapping(self): - """ - Use collective communication to build a mapping from actor IDs to ranks. - This should be called once during initialization. - """ - if self._comm is None: - return {} - - current_actor = ray.get_runtime_context().current_actor - actor_id_str = current_actor._actor_id.hex() - - # Ray actor IDs are 32-character hex strings (128 bits) - ACTOR_ID_LEN = 32 - actor_id_bytes = actor_id_str.encode('utf-8') - assert len( - actor_id_bytes - ) == ACTOR_ID_LEN, f"Unexpected actor ID length: {len(actor_id_bytes)}" - - actor_id_tensor = torch.frombuffer( - actor_id_bytes, dtype=torch.uint8).to(self._comm.device) - - # All-gather full actor IDs from all actors - gathered_ids = self._comm.all_gather(actor_id_tensor, dim=0) - - # Build mapping: actor_id -> device_comm_rank - self._actor_id_to_rank = {} - for rank in range(self._world_size): - start_idx = rank * ACTOR_ID_LEN - end_idx = (rank + 1) * ACTOR_ID_LEN - actor_bytes = gathered_ids[start_idx:end_idx].cpu().numpy( - ).tobytes() - actor_id = actor_bytes.decode('utf-8') - self._actor_id_to_rank[actor_id] = rank - - def initialize(self, rank: int) -> None: - # No additional initialization is needed. - pass - - def get_actor_handles(self) -> list["ray.actor.ActorHandle"]: - return self._actor_handles - - def get_rank(self, actor: ray.actor.ActorHandle) -> int: - """ - Return the given actor's rank using device communicator collective ops. - """ - assert hasattr(self, '_actor_id_to_rank'), ( - "Actor rank mapping not built. " - "This should have been done during initialization.") - - actor_id_str = actor._actor_id.hex() - - if actor_id_str in self._actor_id_to_rank: - return self._actor_id_to_rank[actor_id_str] # type: ignore - else: - raise ValueError(f"Actor {actor} not found in communicator group") - - def get_self_rank(self) -> Optional[int]: - """ - Return this actor's rank. - """ - return self._rank - - def get_world_size(self) -> int: - """ - Return the number of ranks in the RayPPCommunicator group. - """ - return self._world_size - - def send(self, buf: "torch.Tensor", peer_rank: int) -> None: - """ - Send a torch.Tensor to a peer. - - This returns when the send kernel has been queued, but the kernel may - not have completed. Therefore, the caller should ensure that there are - no concurrent writes to the sent `buf` until the send has finished. - That is, either all writes should be submitted on the current stream - (self._cuda_stream) or, if on a different stream, that stream should - synchronize with the current stream. - - Args: - buf: The torch.Tensor to send. It should already be on this - actor's default device. - peer_rank: The rank of the actor to send to. - """ - if self._closed: - raise RayChannelError("RayPPCommunicator has been destroyed.") - - assert self._comm is not None - self._comm.send(buf, peer_rank) - - def recv( - self, - shape: tuple[int], - dtype: "torch.dtype", - peer_rank: int, - allocator: TorchTensorAllocator, - ) -> "torch.Tensor": - """ - Receive a torch.Tensor from a peer and synchronize the current stream. - - After this call returns, the receive buffer is safe to read from from - any stream. An RayChannelError will be raised if an error occurred - (e.g., remote actor died), and the buffer is not safe to read. - - Args: - shape: The shape of the tensor to receive. - dtype: The dtype of the tensor to receive. - peer_rank: The rank of the actor to receive from. - allocator: The allocator to use to create the received tensor. - This is ignored for this implementation. - """ - if self._closed: - raise RayChannelError("RayPPCommunicator has been destroyed.") - - assert self._comm is not None - size = torch.Size(shape) - buf = self._comm.recv(size, dtype, src=peer_rank) - - # Buffer values are undefined if NCCL ops are aborted. Therefore, we - # need to synchronize here and check that the channel is still - # open to ensure that the receive buffer is valid. - # TODO(swang): Avoid CUDA synchronization. - current_stream().synchronize() - - if self._closed: - raise RayChannelError("RayPPCommunicator has been destroyed.") - return buf - - def allgather( - self, - send_buf: "torch.Tensor", - recv_buf: "torch.Tensor", - ): - raise NotImplementedError("allgather is not supported") - - def allreduce( - self, - send_buf: "torch.Tensor", - recv_buf: "torch.Tensor", - op: ReduceOp = ReduceOp.SUM, - ): - raise NotImplementedError("allreduce is not supported") - - def reducescatter( - self, - send_buf: "torch.Tensor", - recv_buf: "torch.Tensor", - op: ReduceOp = ReduceOp.SUM, - ): - raise NotImplementedError("reducescatter is not supported") - - @property - def recv_stream(self): - return torch.cuda.StreamContext(current_stream()) - - @property - def send_stream(self): - return torch.cuda.StreamContext(current_stream()) - - def destroy(self) -> None: - # Just sets a flag, vLLM manages the lifecycle of the underlying - # _PP GroupCoordinator. - self._closed = True - - def get_transport_name(self) -> str: - return "nccl" - - @classmethod - def generate_communicator_id(cls) -> Any: - return uuid.uuid4() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9f5e829d0d3aa..f94b57dbb04f2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, import regex as re import torch from pydantic import TypeAdapter, ValidationError -from typing_extensions import TypeIs, deprecated +from typing_extensions import TypeIs import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, @@ -217,12 +217,10 @@ Additionally, list elements can be passed individually using `+`: elif contains_type(type_hints, list): type_hint = get_type(type_hints, list) types = get_args(type_hint) - list_type = types[0] - if get_origin(list_type) is Union: - msg = "List type must contain str if it is a Union." - assert str in get_args(list_type), msg - list_type = str - kwargs[name]["type"] = list_type + assert len(types) == 1, ( + "List type must have exactly one type. Got " + f"{type_hint} with types {types}") + kwargs[name]["type"] = types[0] kwargs[name]["nargs"] = "+" elif contains_type(type_hints, int): kwargs[name]["type"] = int @@ -986,28 +984,8 @@ class EngineArgs: provided as a JSON string input via CLI arguments or directly as a dictionary from the engine. """ - - from vllm.transformers_utils.config import get_config - from vllm.transformers_utils.configs.speculators.base import ( - SpeculatorsConfig) - if self.speculative_config is None: - hf_config = get_config(self.hf_config_path or self.model, - self.trust_remote_code, self.revision, - self.code_revision, self.config_format) - - # if loading a SpeculatorsConfig, load the specualtive_config - # details from the config directly - # no user input required / expected - if isinstance(hf_config, SpeculatorsConfig): - # We create one since we dont create one - self.speculative_config = {} - self.speculative_config[ - "num_speculative_tokens"] = hf_config.num_lookahead_tokens - self.speculative_config["model"] = self.model - self.speculative_config["method"] = hf_config.method - else: - return None + return None # Note(Shangming): These parameters are not obtained from the cli arg # '--speculative-config' and must be passed in when creating the engine @@ -1227,18 +1205,6 @@ class EngineArgs: enable_multimodal_encoder_data_parallel, ) - supports_mm_preprocessor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not supports_mm_preprocessor_cache - and model_config.is_multimodal_model - and not model_config.disable_mm_preprocessor_cache): - logger.warning( - "Multi-modal preprocessor cache is not compatible " - "with data parallelism when there does not exist a " - "one-to-one correspondance between API process and " - "EngineCore process, so the cache will be disabled.") - model_config.set_disable_mm_preprocessor_cache(True) - speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, @@ -1712,23 +1678,7 @@ class EngineArgs: @dataclass class AsyncEngineArgs(EngineArgs): """Arguments for asynchronous vLLM engine.""" - enable_log_requests: bool = False - - @property - @deprecated( - "`disable_log_requests` is deprecated and has been replaced with " - "`enable_log_requests`. This will be removed in v0.12.0. Please use " - "`enable_log_requests` instead.") - def disable_log_requests(self) -> bool: - return not self.enable_log_requests - - @disable_log_requests.setter - @deprecated( - "`disable_log_requests` is deprecated and has been replaced with " - "`enable_log_requests`. This will be removed in v0.12.0. Please use " - "`enable_log_requests` instead.") - def disable_log_requests(self, value: bool): - self.enable_log_requests = not value + disable_log_requests: bool = False @staticmethod def add_cli_args(parser: FlexibleArgumentParser, @@ -1739,15 +1689,9 @@ class AsyncEngineArgs(EngineArgs): load_general_plugins() if not async_args_only: parser = EngineArgs.add_cli_args(parser) - parser.add_argument('--enable-log-requests', - action=argparse.BooleanOptionalAction, - default=AsyncEngineArgs.enable_log_requests, - help='Enable logging requests.') parser.add_argument('--disable-log-requests', - action=argparse.BooleanOptionalAction, - default=not AsyncEngineArgs.enable_log_requests, - help='[DEPRECATED] Disable logging requests.', - deprecated=True) + action='store_true', + help='Disable logging requests.') current_platform.pre_register_and_update(parser) return parser diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1f962b008ee03..06bb4eeab69eb 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -30,7 +30,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, deprecate_kwargs, weak_bind +from vllm.utils import Device, weak_bind logger = init_logger(__name__) ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -554,20 +554,14 @@ class AsyncLLMEngine(EngineClient): return LLMEngine._get_executor_cls(engine_config) @classmethod - @deprecate_kwargs( - "disable_log_requests", - additional_message=("This argument will have no effect. " - "Use `enable_log_requests` instead."), - ) def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - enable_log_requests: bool = False, - disable_log_stats: bool = False, - disable_log_requests: bool = True, # Deprecated, will be removed + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + disable_log_requests: bool = False, + disable_log_stats: bool = False, ) -> "AsyncLLMEngine": """Create an AsyncLLMEngine from the EngineArgs.""" @@ -575,7 +569,7 @@ class AsyncLLMEngine(EngineClient): vllm_config=vllm_config, executor_class=cls._get_executor_cls(vllm_config), start_engine_loop=start_engine_loop, - log_requests=enable_log_requests, + log_requests=not disable_log_requests, log_stats=not disable_log_stats, usage_context=usage_context, stat_loggers=stat_loggers, @@ -604,7 +598,7 @@ class AsyncLLMEngine(EngineClient): usage_context=usage_context, stat_loggers=stat_loggers, disable_log_stats=engine_args.disable_log_stats, - enable_log_requests=engine_args.enable_log_requests, + disable_log_requests=engine_args.disable_log_requests, ) @property diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 903f3fd71ebcd..fe6eb0d8c2f1a 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -34,7 +34,6 @@ from vllm.outputs import RequestOutput from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.utils import deprecate_kwargs from vllm.worker.model_runner_base import InputProcessingError logger = init_logger(__name__) @@ -121,20 +120,10 @@ class MQLLMEngine: return ENGINE_DEAD_ERROR() @classmethod - @deprecate_kwargs( - "disable_log_requests", - additional_message=("This argument will have no effect. " - "Use `enable_log_requests` instead."), - ) - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext, - enable_log_requests: bool, - disable_log_stats: bool, - ipc_path: str, - disable_log_requests: bool = True, # Deprecated, will be removed - ) -> "MQLLMEngine": + def from_vllm_config(cls, vllm_config: VllmConfig, + usage_context: UsageContext, + disable_log_requests: bool, disable_log_stats: bool, + ipc_path: str) -> "MQLLMEngine": # Setup plugins for each process from vllm.plugins import load_general_plugins load_general_plugins() @@ -147,7 +136,7 @@ class MQLLMEngine: ipc_path=ipc_path, usage_context=usage_context, use_async_sockets=use_async_sockets, - log_requests=enable_log_requests, + log_requests=(not disable_log_requests), log_stats=(not disable_log_stats), ) @@ -161,7 +150,7 @@ class MQLLMEngine: ipc_path=ipc_path, vllm_config=vllm_config, usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, + disable_log_requests=engine_args.disable_log_requests, disable_log_stats=engine_args.disable_log_stats, ) @@ -447,7 +436,7 @@ def signal_handler(*_) -> None: def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, ipc_path: str, disable_log_stats: bool, - enable_log_requests: bool, engine_alive): + disable_log_requests: bool, engine_alive): try: # Ensure we can serialize transformer config before spawning maybe_register_config_serialize_by_value() @@ -456,7 +445,7 @@ def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, vllm_config=vllm_config, usage_context=usage_context, disable_log_stats=disable_log_stats, - enable_log_requests=enable_log_requests, + disable_log_requests=disable_log_requests, ipc_path=ipc_path) signal.signal(signal.SIGTERM, signal_handler) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 9762a1de9edd3..7dcba2cccdb52 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -2,7 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse +import os import signal +import sys from typing import Optional import uvloop @@ -16,10 +18,10 @@ from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, show_filtered_argument_or_group_from_help) +from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri, - set_process_title) +from vllm.utils import FlexibleArgumentParser, get_tcp_uri from vllm.v1.engine.core import EngineCoreProc from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines from vllm.v1.executor.abstract import Executor @@ -75,7 +77,7 @@ def run_headless(args: argparse.Namespace): if args.api_server_count > 1: raise ValueError("api_server_count can't be set in headless mode") - + # set_process_title("Headless_ProcManager") # Create the EngineConfig. engine_args = vllm.AsyncEngineArgs.from_cli_args(args) usage_context = UsageContext.OPENAI_API_SERVER @@ -140,6 +142,8 @@ def run_multi_api_server(args: argparse.Namespace): orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache + # set_process_title("ProcManager") + if num_api_servers > 1: setup_multiprocess_prometheus() @@ -163,9 +167,8 @@ def run_multi_api_server(args: argparse.Namespace): if model_config.is_multimodal_model and not ( orig_disable_mm_preprocessor_cache): - logger.warning( - "Multi-modal preprocessor cache is not compatible " - "with api_server_count > 1, so the cache will be disabled.") + logger.warning("Multi-model preprocessor cache will be disabled " + "for api_server_count > 1") executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats @@ -224,10 +227,12 @@ def run_api_server_worker_proc(listen_address, **uvicorn_kwargs) -> None: """Entrypoint for individual API server worker processes.""" - # Set process title and add process-specific prefix to stdout and stderr. - server_index = client_config.get("client_index", 0) if client_config else 0 - set_process_title("APIServer", str(server_index)) - decorate_logs() + # Add process-specific prefix to stdout and stderr. + from multiprocessing import current_process + process_name = current_process().name + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) uvloop.run( run_server_worker(listen_address, sock, args, client_config, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9bf4702320788..05d9a69a65f83 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -11,6 +11,7 @@ import multiprocessing import os import signal import socket +import sys import tempfile import uuid from argparse import Namespace @@ -94,15 +95,15 @@ from vllm.entrypoints.openai.serving_transcription import ( from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, log_non_default_args, with_cancellation) +from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, - get_open_zmq_ipc_path, is_valid_ipv6_address, - set_ulimit) +from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, + is_valid_ipv6_address, set_process_title, set_ulimit) from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -199,18 +200,15 @@ async def build_async_engine_client_from_engine_args( from vllm.v1.engine.async_llm import AsyncLLM async_llm: Optional[AsyncLLM] = None - client_count = client_config.pop( - "client_count") if client_config else 1 client_index = client_config.pop( "client_index") if client_config else 0 try: async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, + disable_log_requests=engine_args.disable_log_requests, disable_log_stats=engine_args.disable_log_stats, client_addresses=client_config, - client_count=client_count, client_index=client_index) # Don't keep the dummy data in memory @@ -230,7 +228,7 @@ async def build_async_engine_client_from_engine_args( engine_client = AsyncLLMEngine.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, + disable_log_requests=engine_args.disable_log_requests, disable_log_stats=engine_args.disable_log_stats) yield engine_client finally: @@ -275,7 +273,7 @@ async def build_async_engine_client_from_engine_args( target=run_mp_engine, args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path, engine_args.disable_log_stats, - engine_args.enable_log_requests, engine_alive)) + engine_args.disable_log_requests, engine_alive)) engine_process.start() engine_pid = engine_process.pid assert engine_pid is not None, "Engine process failed to start." @@ -1573,10 +1571,10 @@ async def init_app_state( else: served_model_names = [args.model] - if args.enable_log_requests: - request_logger = RequestLogger(max_log_len=args.max_log_len) - else: + if args.disable_log_requests: request_logger = None + else: + request_logger = RequestLogger(max_log_len=args.max_log_len) base_model_paths = [ BaseModelPath(name=name, model_path=args.model) @@ -1810,7 +1808,10 @@ async def run_server(args, **uvicorn_kwargs) -> None: """Run a single-worker API server.""" # Add process-specific prefix to stdout and stderr. - decorate_logs("APIServer") + process_name = "APIServer" + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) listen_address, sock = setup_server(args) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) @@ -1827,7 +1828,7 @@ async def run_server_worker(listen_address, ToolParserManager.import_tool_parser(args.tool_parser_plugin) server_index = client_config.get("client_index", 0) if client_config else 0 - + set_process_title("APIServer", str(server_index)) # Load logging config for uvicorn if specified log_config = load_log_config(args.log_config_file) if log_config is not None: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index d77aee345843c..b6b3bf3f530e3 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -859,15 +859,6 @@ class ChatCompletionRequest(OpenAIBaseModel): 'are supported.' ) - # if tool_choice is "required" but the "tools" list is empty, - # override the data to behave like "none" to align with - # OpenAI’s behavior. - if data["tool_choice"] == "required" and isinstance( - data["tools"], list) and len(data["tools"]) == 0: - data["tool_choice"] = "none" - del data["tools"] - return data - # ensure that if "tool_choice" is specified as an object, # it matches a valid tool correct_usage_message = 'Correct usage: `{"type": "function",' \ diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index d146ad485d194..137b368dad202 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -324,10 +324,10 @@ async def run_batch( else: served_model_names = [args.model] - if args.enable_log_requests: - request_logger = RequestLogger(max_log_len=args.max_log_len) - else: + if args.disable_log_requests: request_logger = None + else: + request_logger = RequestLogger(max_log_len=args.max_log_len) base_model_paths = [ BaseModelPath(name=name, model_path=args.model) diff --git a/vllm/envs.py b/vllm/envs.py index 2d470c6dccbfd..19bc9156b2586 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -55,7 +55,6 @@ if TYPE_CHECKING: VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto" VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False - VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True VLLM_XLA_USE_SPMD: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") @@ -69,6 +68,7 @@ if TYPE_CHECKING: MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False + VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_NO_DEPRECATION_WARNING: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False @@ -126,7 +126,6 @@ if TYPE_CHECKING: VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None VLLM_TPU_USING_PATHWAYS: bool = False VLLM_USE_DEEP_GEMM: bool = False - VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False VLLM_XGRAMMAR_CACHE_MB: int = 0 @@ -214,7 +213,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Target device of vLLM, supporting [cuda (by default), # rocm, neuron, cpu] "VLLM_TARGET_DEVICE": - lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), + lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"), # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs @@ -229,8 +228,14 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( - os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in + ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + + # Used to mark that setup.py is running in a Docker build context, + # in order to force the use of precompiled binaries. + "VLLM_DOCKER_BUILD_CONTEXT": + lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in + ("1", "true"), # Whether to force using nightly wheel in python build. # This is used for testing the nightly wheel in python build. @@ -500,13 +505,6 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0")) ), - # If the env var is set, it uses a Ray Communicator wrapping - # vLLM's pipeline parallelism communicator to interact with Ray's - # Compiled Graph. Otherwise, it uses Ray's NCCL communicator. - # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set. - "VLLM_USE_RAY_WRAPPED_PP_COMM": - lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))), - # Use dedicated multiprocess context for workers. # Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": @@ -911,14 +909,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), - # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm - # JIT all the required kernels before model execution so there is no - # JIT'ing in the hot-path. However, this warmup increases the engine - # startup time by a couple of minutes. - # Set `VLLM_SKIP_DEEP_GEMM_WARMUP` to disable the warmup. - "VLLM_SKIP_DEEP_GEMM_WARMUP": - lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))), - # Allow use of FlashInfer MoE kernels for fused moe ops. "VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))), diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index 48b3479ed7997..a6c172beff7bb 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -3,20 +3,21 @@ import asyncio import os +import sys import threading import uuid from dataclasses import dataclass from multiprocessing import Queue from multiprocessing.connection import wait from multiprocessing.process import BaseProcess -from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union +from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO, + TypeVar, Union) import torch from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context, - run_method) +from vllm.utils import _maybe_force_spawn, get_mp_context, run_method logger = init_logger(__name__) @@ -24,6 +25,10 @@ T = TypeVar('T') _TERMINATE = "TERMINATE" # sentinel +# ANSI color codes +CYAN = '\033[1;36m' +RESET = '\033[0;0m' + JOIN_TIMEOUT_S = 2 @@ -208,7 +213,9 @@ def _run_worker_process( # Add process-specific prefix to stdout and stderr process_name = get_mp_context().current_process().name - decorate_logs(process_name) + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) # Initialize worker worker = worker_factory(vllm_config, rank) @@ -253,6 +260,33 @@ def _run_worker_process( logger.info("Worker exiting") +def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: + """Prepend each output line with process-specific prefix""" + + prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " + file_write = file.write + + def write_with_prefix(s: str): + if not s: + return + if file.start_new_line: # type: ignore[attr-defined] + file_write(prefix) + idx = 0 + while (next_idx := s.find('\n', idx)) != -1: + next_idx += 1 + file_write(s[idx:next_idx]) + if next_idx == len(s): + file.start_new_line = True # type: ignore[attr-defined] + return + file_write(prefix) + idx = next_idx + file_write(s[idx:]) + file.start_new_line = False # type: ignore[attr-defined] + + file.start_new_line = True # type: ignore[attr-defined] + file.write = write_with_prefix # type: ignore[method-assign] + + def set_multiprocessing_worker_envs(parallel_config): """ Set up environment variables that should be used when there are workers in a multiprocessing environment. This should be called by the parent diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 37c3fe59c65dd..e9ad62aeb99a8 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -608,21 +608,6 @@ class RayDistributedExecutor(DistributedExecutorBase): forward_dag = MultiOutputNode(outputs) - if envs.VLLM_USE_RAY_WRAPPED_PP_COMM: - from ray.experimental.channel.accelerator_context import ( - register_accelerator_context) - - from vllm.distributed.device_communicators.ray_communicator import ( - RayPPCommunicator) - register_accelerator_context(torch_module_name="cuda", - communicator_cls=RayPPCommunicator) - logger.info("Using RayPPCommunicator " - "(which wraps vLLM _PP GroupCoordinator) " - "for Ray Compiled Graph communication.") - else: - logger.info("Using Ray's NCCL communicator for " - "Ray Compiled Graph communication.") - return forward_dag.experimental_compile( enable_asyncio=enable_asyncio, _overlap_gpu_communication=envs. diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 6331a70b469aa..652136fbbfe73 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -11,7 +11,7 @@ from typing_extensions import TypeVar from vllm.jsontree import JSONTree, json_map_leaves from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.utils import get_allowed_kwarg_only_overrides +from vllm.utils import resolve_mm_processor_kwargs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -154,11 +154,14 @@ class InputProcessingContext(InputContext): assert callable(hf_processor) mm_config = self.model_config.get_multimodal_config() - merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs) + base_kwargs = mm_config.mm_processor_kwargs + if base_kwargs is None: + base_kwargs = {} - allowed_kwargs = get_allowed_kwarg_only_overrides( + merged_kwargs = resolve_mm_processor_kwargs( + base_kwargs, + kwargs, hf_processor, - merged_kwargs, requires_kw_only=False, allow_var_kwargs=True, ) @@ -170,9 +173,7 @@ class InputProcessingContext(InputContext): return x try: - output = hf_processor(**data, - **allowed_kwargs, - return_tensors="pt") + output = hf_processor(**data, **merged_kwargs, return_tensors="pt") # this emulates output.to(dtype=self.model_config.dtype) if isinstance(output, BatchFeature): cast_output = json_map_leaves(maybe_cast_dtype, output.data) @@ -188,7 +189,7 @@ class InputProcessingContext(InputContext): except Exception as exc: msg = (f"Failed to apply {type(hf_processor).__name__} " - f"on data={data} with kwargs={allowed_kwargs}") + f"on data={data} with kwargs={merged_kwargs}") raise ValueError(msg) from exc diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index de5933d6d41e5..c3512ec3dbd43 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -682,14 +682,12 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def slice_lora_b( self, lora_b: list[Union[torch.Tensor, None]] ) -> list[Union[torch.Tensor, None]]: - sliced_lora_b = [None] * self.n_slices for i, (shard_id, shard_size) in enumerate( zip(self.output_ids, self.output_slices)): if (lora_b_i := lora_b[i]) is not None: - sliced_lora_b[i] = lora_b_i[:, - shard_size * shard_id:shard_size * - (shard_id + 1)] - return sliced_lora_b + lora_b[i] = lora_b_i[:, shard_size * shard_id:shard_size * + (shard_id + 1)] + return lora_b def slice_bias( self, bias: list[Union[torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index bd3605378b6dc..b89e5ac6f093e 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -4,9 +4,7 @@ import functools from typing import Any, Optional import torch -from tqdm import tqdm -import vllm.envs as env import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig @@ -19,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) -from vllm.utils import has_deep_gemm, run_once +from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import m_grouped_fp8_gemm_nt_contiguous logger = init_logger(__name__) @@ -84,65 +82,6 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor, return True -@run_once -def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - num_topk: int): - """ - DeepGemm JITs the grouped-gemm kernels. The JIT'ing happens based on the - input tensor shapes. In this function, we construct all possible input - tensor shapes so all the kernels are JIT'ed and cached. - Note that this warmup is expected to happen during the model profile - call and not during actual model inference. - """ - - assert w1.size(0) == w2.size(0), ( - "w1 and w2 must have the same number of experts") - - block_m = deep_gemm_block_shape()[0] - num_experts = w1.size(0) - device = w1.device - - # This is the maximum GroupedGemm M size that we expect to run - # the grouped_gemm with. - MAX_M = compute_aligned_M(env.VLLM_FUSED_MOE_CHUNK_SIZE, - num_topk, - num_experts, - block_m, - expert_tokens_meta=None) - # Distribute expert-ids evenly. - MAX_BLOCKS = MAX_M // block_m - expert_ids_block = torch.randint(low=0, - high=num_experts, - size=(MAX_BLOCKS, ), - device=device, - dtype=torch.int32) - expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0) - - def _warmup(w: torch.Tensor, w_scale: torch.Tensor): - - _, n, k = w.size() - a1q = torch.empty((MAX_M, k), device=device).to(torch.float8_e4m3fn) - a1q_scales = torch.empty((MAX_M, k // block_m), - device=device, - dtype=torch.float32) - out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16) - - pbar = tqdm(total=MAX_BLOCKS, - desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})") - num_tokens = MAX_M - while num_tokens > 0: - m_grouped_fp8_gemm_nt_contiguous( - (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale), - out[:num_tokens], expert_ids[:num_tokens]) - pbar.update(1) - num_tokens = num_tokens - block_m - - _warmup(w1, w1_scale) - _warmup(w2, w2_scale) - - class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__(self): @@ -217,20 +156,6 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): ): assert self.block_shape is not None assert a1q_scale is not None - assert w1_scale is not None - assert w2_scale is not None - - if not env.VLLM_SKIP_DEEP_GEMM_WARMUP: - # DeepGemm JITs the grouped-gemm kernels. We don't want the JIT'ing - # to happen during actual model-inference. The - # `warmup_deepgemm_kernels` function is a `run_once` decorated - # function that executes during the model profile run. This warmup - # should create all the required JITs for the current model. - warmup_deepgemm_gg_contiguous_kernels(w1, - w2, - w1_scale, - w2_scale, - num_topk=topk_ids.size(1)) a1q = hidden_states _, N, K = w1.size() diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index f6b62254e7b4c..7016ff34c3a85 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -144,13 +144,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): "apply_router_weight_on_input is only implemented for topk=1") a1 = a1 * topk_weights.to(a1.dtype) - if quant_config.is_block_quantized: - # Quant and Dispatch + if quant_config.per_act_token_quant: a1q, a1q_scale = moe_kernel_quantize_input( a1, a1_scale, quant_dtype=quant_config.quant_dtype, - per_act_token_quant=quant_config.per_act_token_quant, + per_act_token_quant=True, block_shape=quant_config.block_shape, ) if a1q_scale is not None and a1q_scale.numel() == 1: @@ -163,10 +162,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): rank_topk_weights=topk_weights, num_experts=num_experts) else: - # Dispatch and Quant - # DeepEP kernels only support dispatching block-quantized - # activation scales. - # Dispatch in bfloat16 + # DeepEP kernels only support dispatching per-token-quant + # quantization. dispatch in bfloat16. (expert_x, _, expert_tokens_meta, expert_topk_ids, expert_topk_weights) = self._do_dispatch( tokens=a1, @@ -174,7 +171,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): rank_topk_ids=topk_ids, rank_topk_weights=topk_weights, num_experts=num_experts) - # Quantize after dispatch. + # quantize now expert_x_scale = None if expert_x.numel() != 0: expert_x, expert_x_scale = moe_kernel_quantize_input( diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 984239ba218a4..cb9842ef7dfe6 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -761,8 +761,8 @@ def get_moe_wna16_block_config(config: dict[str, def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int, num_experts: int, bit: int): - return current_platform.is_cuda() and bit == 4 and \ - group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6 + return bit == 4 and group_size in [32, 64, 128] and \ + num_valid_tokens / num_experts <= 6 def get_default_config( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 09d8890888fa8..17b41e8a1c23c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -17,14 +17,9 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported) -from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa - FlashInferCutlassMoEPrepareAndFinalize) from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP) from vllm.model_executor.layers.quantization.utils import replace_parameter -from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( - build_flashinfer_fp4_cutlass_moe_kernel, - flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_moe_marlin_supports_layer, marlin_make_workspace_new, marlin_moe_permute_scales) @@ -33,7 +28,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( prepare_moe_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - swizzle_blockscale) + cutlass_fp4_supported, swizzle_blockscale) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs @@ -101,14 +96,8 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): def __init__(self): - from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 - detect_nvfp4_moe_support) - _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) - self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported - self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass - self.use_marlin = _nvfp4.use_marlin + self.use_marlin = not cutlass_fp4_supported() self.group_size = 16 - self.fused_experts = None # type: ignore[assignment] def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -211,14 +200,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): layer.w2_weight = torch.nn.Parameter(layer.w2_weight_packed.data, requires_grad=False) - # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel. - if self.allow_flashinfer_cutlass: - w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data, - layer.w13_weight_scale.data, - dim=-2) - layer.w13_weight = torch.nn.Parameter(w, requires_grad=False) - layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False) - if not torch.allclose(layer.w13_weight_global_scale[:, 0], layer.w13_weight_global_scale[:, 1]): logger.warning_once( @@ -265,21 +246,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): layer.w2_input_scale_quant = torch.nn.Parameter( (layer.w2_input_global_scale), requires_grad=False) - def maybe_swap_experts_impl(self, moe_parallel_config): - if not self.allow_flashinfer_cutlass: - return - self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( - moe_parallel_config) - - def select_gemm_impl(self, prepare_finalize, moe): - """Return the appropriate GEMM experts implementation.""" - assert moe is not None and prepare_finalize is not None - from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 - select_nvfp4_gemm_impl) - - return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, - logger) - def apply( self, layer: torch.nn.Module, @@ -337,23 +303,10 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): global_num_experts=global_num_experts, expert_map=expert_map) - # FlashInfer fused experts path - if self.fused_experts is not None: - return flashinfer_fp4_cutlass_moe_forward( - self.fused_experts, - layer, - x, - topk_weights, - topk_ids, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, - ) - assert expert_map is None, ("Expert Parallelism / expert_map " "is currently not supported for " "CompressedTensorsW4A4MoeMethod.") + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( cutlass_moe_fp4) diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index f18c936bac605..d3ab1be3bee01 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -10,11 +10,10 @@ import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops -from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, QuantizeMethodBase) + QuantizationConfig) from vllm.model_executor.layers.quantization.utils.gptq_utils import ( get_linear_quant_method) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, @@ -111,23 +110,8 @@ class GPTQConfig(QuantizationConfig): return cls(weight_bits, group_size, desc_act, lm_head_quantized, dynamic) - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]: - if isinstance(layer, FusedMoE): - # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility - from .moe_wna16 import MoeWNA16Config - - config = { - "quant_method": "gptq", - "bits": self.weight_bits, - "group_size": self.group_size, - "sym": True, # GPTQ typically uses symmetric quantization - "lm_head": False, - } - return MoeWNA16Config.from_config(config).get_quant_method( - layer, prefix) - + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["GPTQLinearMethod"]: return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 0334a2824512d..b8ffcf90c022b 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -10,8 +10,11 @@ from torch.nn.parameter import Parameter import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant +from vllm.distributed import get_ep_group from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + FlashInferCutlassMoEPrepareAndFinalize) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, @@ -20,9 +23,6 @@ from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod -from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( - build_flashinfer_fp4_cutlass_moe_kernel, - flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights, swap_w13_to_w31) @@ -35,6 +35,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp, requantize_with_max_scale) from vllm.model_executor.parameter import (ModelWeightParameter, PerTensorScaleParameter) +from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils.flashinfer import has_flashinfer_moe @@ -868,12 +869,28 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def __init__(self, quant_config: ModelOptNvFp4Config): self.quant_config = quant_config - from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 - detect_nvfp4_moe_support) - _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) - self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported - self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass - self.use_marlin = _nvfp4.use_marlin + self.cutlass_nvfp4_supported = cutlass_fp4_supported() + self.use_marlin = False + self.allow_flashinfer_cutlass = False + + if envs.VLLM_USE_FLASHINFER_MOE_FP4: + if self.cutlass_nvfp4_supported and current_platform.is_cuda() \ + and current_platform.is_device_capability(100): + logger.info_once( + "Using FlashInfer kernels for ModelOptNvFp4FusedMoE.") + self.allow_flashinfer_cutlass = True + else: + logger.warning_once( + "Flashinfer CUTLASS Fused MoE not supported " + "or found on the current platform.") + + if not self.cutlass_nvfp4_supported: + if is_fp4_marlin_supported(): + self.use_marlin = True + else: + raise ValueError("Current platform does not support NVFP4" + " quantization. Please use Blackwell and" + " above.") self.fused_experts = None # type: ignore @@ -883,8 +900,29 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): ): if not self.allow_flashinfer_cutlass: return - self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( - moe_parallel_config) + + logger.debug_once("FlashInferExperts") + # default to TP/EP case only + + experts_kwargs: dict[str, Any] = { + "use_nvfp4_w4a4": True, + "use_dp": moe_parallel_config.dp_size > 1, + "ep_rank": moe_parallel_config.ep_rank, + "ep_size": moe_parallel_config.ep_size, + "tp_rank": moe_parallel_config.tp_rank, + "tp_size": moe_parallel_config.tp_size, + } + + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + FlashInferExperts) + experts = FlashInferExperts(**experts_kwargs) + self.fused_experts = mk.FusedMoEModularKernel( + FlashInferCutlassMoEPrepareAndFinalize( + quant_dtype=torch.uint8, + #meaning 2x e2m1 packed in one, kernel requirement + ), + experts, + ) # This method update self.fused_experts # only prepare_finalize is not None call select_gemm_impl @@ -893,12 +931,32 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def select_gemm_impl(self, prepare_finalize, moe) -> mk.FusedMoEPermuteExpertsUnpermute: - assert moe is not None and prepare_finalize is not None - from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 - select_nvfp4_gemm_impl) + assert moe is not None + assert prepare_finalize is not None + experts = None + all2all_manager = get_ep_group().device_communicator.all2all_manager + assert all2all_manager is not None + if self.allow_flashinfer_cutlass: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + FlashInferExperts) + logger.debug_once("Using FlashInferExperts") + experts = FlashInferExperts( + use_nvfp4_w4a4=True, + use_dp=moe.moe_parallel_config.dp_size > 1, + ep_rank=moe.moe_parallel_config.ep_rank, + ep_size=moe.moe_parallel_config.ep_size, + tp_rank=moe.moe_parallel_config.tp_rank, + tp_size=moe.moe_parallel_config.tp_size, + ) + else: + assert moe.dp_size > 1 + logger.debug_once("Using CutlassExpertsFp4") + # Currently CutlassExpertsFp4 doesn't support DP + raise ValueError("CutlassExpertsFp4 doesn't support DP. " + "Use flashinfer CUTLASS FusedMoE backend instead " + "(set VLLM_USE_FLASHINFER_MOE_FP4=1)") - return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, - logger) + return experts def uses_weight_scale_2_pattern(self) -> bool: """ @@ -1004,8 +1062,18 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): gemm1_weight_scale = layer.w13_weight_scale.data if self.allow_flashinfer_cutlass: - gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1( - gemm1_weight, gemm1_weight_scale, dim=-2) + dim = -2 + size = gemm1_weight.size(dim) + assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}" + half = size // 2 + + # Reorder weight + w1, w3 = gemm1_weight.split(half, dim=dim) + gemm1_weight = torch.cat([w3, w1], dim=dim).contiguous() + + # Reorder scale + s1, s3 = gemm1_weight_scale.split(half, dim=dim) + gemm1_weight_scale = torch.cat([s3, s1], dim=dim).contiguous() layer.w13_weight = Parameter(gemm1_weight, requires_grad=False) layer.w13_weight_scale = Parameter(gemm1_weight_scale, @@ -1149,15 +1217,49 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input) else: - out = flashinfer_fp4_cutlass_moe_forward( - self.fused_experts, - layer, - x, - topk_weights, - topk_ids, + # TP or DP case + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + is_valid_flashinfer_cutlass_fused_moe) + assert is_valid_flashinfer_cutlass_fused_moe( + x, layer.w13_weight, layer.w2_weight), ( + "Flashinfer CUTLASS Fused MoE not applicable!") + + a1_gscale = layer.w13_input_scale_quant + a2_gscale = layer.w2_input_scale_quant + extra_expert_args = { + 'g1_alphas': layer.g1_alphas, + 'g2_alphas': layer.g2_alphas, + 'out_dtype': x.dtype, + # Avoid confusion with a1_scale and a2_scale + # where are batch size related. + 'a1_gscale': a1_gscale, + 'a2_gscale': a2_gscale, + } + extra_prepare_args = { + 'use_dp': layer.dp_size > 1, + 'local_tokens': x.shape[0], + 'a1_gscale': a1_gscale, + } + extra_finalize_args = { + 'use_dp': layer.dp_size > 1, + 'local_tokens': x.shape[0], + } + + out = self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, # TODO(shuw): fix later, now output is high prec activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, + w1_scale=layer.w13_blockscale_swizzled, + w2_scale=layer.w2_blockscale_swizzled, apply_router_weight_on_input=apply_router_weight_on_input, + extra_expert_args=extra_expert_args, + extra_prepare_args=extra_prepare_args, + extra_finalize_args=extra_finalize_args, ) return out diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py deleted file mode 100644 index 4c617e226041f..0000000000000 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ /dev/null @@ -1,154 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Utility helpers for NVFP4 + FlashInfer fused-MoE path""" -from __future__ import annotations - -from typing import Optional - -import torch - -import vllm.envs as envs -import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig -from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( - FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe) -from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 - FlashInferCutlassMoEPrepareAndFinalize) -from vllm.platforms import current_platform - -logger = init_logger(__name__) - -__all__ = [ - "is_flashinfer_fp4_cutlass_moe_available", - "reorder_w1w3_to_w3w1", - "build_flashinfer_fp4_cutlass_moe_kernel", - "flashinfer_fp4_cutlass_moe_forward", -] - - -def is_flashinfer_fp4_cutlass_moe_available() -> bool: - """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used.""" - return (envs.VLLM_USE_FLASHINFER_MOE_FP4 and current_platform.is_cuda() - and current_platform.is_device_capability(100)) - - -def reorder_w1w3_to_w3w1(weight: torch.Tensor, - scale: torch.Tensor, - dim: int = -2) -> tuple[torch.Tensor, torch.Tensor]: - """Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`""" - size = weight.size(dim) - assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}" - half = size // 2 - - w1, w3 = weight.split(half, dim=dim) - s1, s3 = scale.split(half, dim=dim) - - return (torch.cat([w3, w1], - dim=dim).contiguous(), torch.cat([s3, s1], - dim=dim).contiguous()) - - -def build_flashinfer_fp4_cutlass_moe_kernel( - moe_parallel_config: FusedMoEParallelConfig, ) -> mk.FusedMoEModularKernel: - """Create *and return* a FlashInfer CUTLASS fused-MoE modular kernel""" - experts = FlashInferExperts( - use_nvfp4_w4a4=True, - use_dp=moe_parallel_config.dp_size > 1, - ep_rank=moe_parallel_config.ep_rank, - ep_size=moe_parallel_config.ep_size, - tp_rank=moe_parallel_config.tp_rank, - tp_size=moe_parallel_config.tp_size, - ) - logger.debug_once("FlashInferExperts (util)") - return mk.FusedMoEModularKernel( - FlashInferCutlassMoEPrepareAndFinalize(quant_dtype=torch.uint8), - experts, - ) - - -def flashinfer_fp4_cutlass_moe_forward( - fused_experts: mk.FusedMoEModularKernel, - layer: torch.nn.Module, - x: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str, - global_num_experts: int, - expert_map: Optional[torch.Tensor], - apply_router_weight_on_input: bool, -) -> torch.Tensor: - """Common forward wrapper for FlashInfer NV-FP4 fused-MoE""" - - assert is_valid_flashinfer_cutlass_fused_moe( - x, layer.w13_weight, - layer.w2_weight), ("FlashInfer CUTLASS fused-MoE not applicable!") - - a1_gscale = layer.w13_input_scale_quant - a2_gscale = layer.w2_input_scale_quant - - extra_expert_args = { - "g1_alphas": layer.g1_alphas, - "g2_alphas": layer.g2_alphas, - # Avoid confusion with a1_scale and a2_scale - # where are batch size related. - "a1_gscale": a1_gscale, - "a2_gscale": a2_gscale, - "out_dtype": x.dtype, - } - extra_prepare_args = { - "use_dp": layer.dp_size > 1, - "local_tokens": x.shape[0], - "a1_gscale": a1_gscale, - } - extra_finalize_args = { - "use_dp": layer.dp_size > 1, - "local_tokens": x.shape[0], - } - - return fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=False, # TODO(shuw): fix later, now output is high prec - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, - apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args, - extra_prepare_args=extra_prepare_args, - extra_finalize_args=extra_finalize_args, - ) - - -def select_nvfp4_gemm_impl( - allow_flashinfer_cutlass: bool, - moe, # FusedMoEConfig - logger): - """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" - - # lazy import - from vllm.distributed import get_ep_group - - all2all_manager = get_ep_group().device_communicator.all2all_manager - assert all2all_manager is not None - - if allow_flashinfer_cutlass: - logger.debug_once("Using FlashInferExperts") - return FlashInferExperts( - use_nvfp4_w4a4=True, - use_dp=moe.moe_parallel_config.dp_size > 1, - ep_rank=moe.moe_parallel_config.ep_rank, - ep_size=moe.moe_parallel_config.ep_size, - tp_rank=moe.moe_parallel_config.tp_rank, - tp_size=moe.moe_parallel_config.tp_size, - ) - - # native cutlass experts currently don't support DP; TP case won't call this - raise ValueError( - "CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS " - "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)") diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py deleted file mode 100644 index 23a749467f193..0000000000000 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from dataclasses import dataclass - -import vllm.envs as envs -from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( - is_flashinfer_fp4_cutlass_moe_available) -from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( - is_fp4_marlin_supported) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - cutlass_fp4_supported) - -__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"] - -_logger = init_logger(__name__) - - -@dataclass(frozen=True) -class NvFp4Support: - """Result container for NV-FP4 capability probing.""" - - cutlass_supported: bool - allow_flashinfer_cutlass: bool - use_marlin: bool - - -def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support: - """Detect platform support for NV-FP4 fused-MoE path""" - cutlass_supported = cutlass_fp4_supported() - - allow_flashinfer = (cutlass_supported - and is_flashinfer_fp4_cutlass_moe_available()) - - if allow_flashinfer: - _logger.info_once("Using FlashInfer kernels for %s.", class_name - or "NVFP4 path") - else: - if envs.VLLM_USE_FLASHINFER_MOE_FP4: - _logger.warning_once( - "FlashInfer kernels unavailable for %s on current platform.", - class_name or "NVFP4 path", - ) - - use_marlin = False - if not cutlass_supported: - if is_fp4_marlin_supported(): - use_marlin = True - _logger.info_once("Falling back to Marlin FP4 MoE kernel.") - else: - raise ValueError( - "Current platform does not support NVFP4 quantization. " - "Please use Blackwell GPUs or enable FlashInfer.") - - return NvFp4Support( - cutlass_supported=cutlass_supported, - allow_flashinfer_cutlass=allow_flashinfer, - use_marlin=use_marlin, - ) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index f54dfab5238e1..68fcb785691c8 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -34,8 +34,7 @@ from vllm.model_executor.model_loader.weight_utils import ( filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, pt_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.models import is_pooling_model -from vllm.model_executor.utils import (get_moe_expert_mapping, - get_packed_modules_mapping, +from vllm.model_executor.utils import (get_packed_modules_mapping, set_weight_attrs) from vllm.platforms import current_platform @@ -44,12 +43,6 @@ from vllm.platforms import current_platform logger = init_logger(__name__) -def is_moe_model(model: torch.nn.Module) -> bool: - """Checks if the model contains FusedMoE layers.""" - return bool(any( - isinstance(module, FusedMoE) for module in model.modules())) - - class BitsAndBytesModelLoader(BaseModelLoader): """Model loader to load model weights with BitAndBytes quantization.""" @@ -68,8 +61,6 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Store all module names (from transformers) that support # BNB quantization. self.target_modules: list[str] = [] - # Store the mapping of expert parameters for MoE models. - self.expert_params_mapping: list[tuple[str, str, int, str]] = [] # mapping weight names from transformers to vllm. self.weight_mapper: Callable = lambda name: name self.pre_quant: bool = False @@ -422,8 +413,13 @@ class BitsAndBytesModelLoader(BaseModelLoader): # in case model has a mixture of disk-merged and disk-split # weights with same last name. self.target_modules.append(name) - elif isinstance(module, FusedMoE) and hasattr( - module.quant_method, "quant_config"): + elif (isinstance(module, FusedMoE) + and hasattr(module.quant_method, "quant_config")): + if not hasattr(model, "get_expert_mapping"): + raise AttributeError( + f"MoE Model {type(model).__name__} does not support " + "BitsAndBytes quantization yet. Ensure this model has " + "'get_expert_mapping' method.") # TODO: support FusedMoE with prequant and 8bit. if self.pre_quant: raise ValueError( @@ -434,9 +430,9 @@ class BitsAndBytesModelLoader(BaseModelLoader): "BitsAndBytes 8bit quantization with FusedMoE is not " "supported yet.") # Get the corresponding weight name using module name and - # expert_params_mapping. - - for exp in self.expert_params_mapping: + # get_expert_mapping. + expert_mapping = model.get_expert_mapping() + for exp in expert_mapping: weight_name = exp[1] rep_name = name.replace("experts", "") + weight_name.removesuffix(".") @@ -468,7 +464,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): elif isinstance(module, (RowParallelLinear, )): self.column_sharded_weights_modules.append(name) elif isinstance(module, FusedMoE): - expert_mapping = self.expert_params_mapping + expert_mapping = model.get_expert_mapping() for exp in expert_mapping: if exp[-1] == "w2": weight_name = exp[1] @@ -520,13 +516,6 @@ class BitsAndBytesModelLoader(BaseModelLoader): self.is_pool_model = is_pooling_model(model) self.modules_mapping = ParamMapping(get_packed_modules_mapping(model)) - if is_moe_model(model): - self.expert_params_mapping = get_moe_expert_mapping(model) - if not self.expert_params_mapping: - raise AttributeError( - f"MoE Model {type(model).__name__} does not support " - "BitsAndBytes quantization yet. Ensure this model has " - "'get_expert_mapping' method.") # For some models like Molmo, we need to use hf_to_vllm_mapper # to ensure correct loading of weights. if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None): @@ -580,10 +569,10 @@ class BitsAndBytesModelLoader(BaseModelLoader): """ from bitsandbytes.functional import QuantState - if not self.expert_params_mapping: + if not hasattr(model, "get_expert_mapping"): return dict() - expert_mapping = self.expert_params_mapping + expert_mapping = model.get_expert_mapping() expert_qs_dict = {} for name, module in model.named_modules(): if not isinstance(module, FusedMoE): diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index b476a4f918bc3..a3eee9f065aea 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -123,10 +123,16 @@ class AyaVisionProcessingInfo(BaseProcessingInfo): return self.ctx.get_hf_config(AyaVisionConfig) def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor: - return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) + processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) - def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor: - return self.get_hf_processor(**kwargs).image_processor + # Temporary workaround since this processor has multiple image tokens + # See https://github.com/huggingface/transformers/issues/38350 + processor._check_special_mm_tokens = lambda *args, **kwargs: None + + return processor + + def get_image_processor(self) -> GotOcr2ImageProcessor: + return self.get_hf_processor().image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 531018625478b..544de5fe02d35 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -214,25 +214,25 @@ class DeepseekVL2MultiModalProcessor( mm_kwargs: Mapping[str, object], tok_kwargs: Mapping[str, object], ) -> BatchFeature: - if not mm_data: + if mm_data: + processed_outputs = self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(prompt=prompt, **mm_data), + dict(**mm_kwargs, **tok_kwargs), + ) + pixel_values = processed_outputs["pixel_values"] + # split pixel values into patches corresponding to each image + images_spatial_crop = processed_outputs["images_spatial_crop"] + patches_per_image = [ + x.prod().item() + 1 for x in images_spatial_crop + ] + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values + else: tokenizer = self.info.get_tokenizer() - return tokenizer(prompt, - add_special_tokens=True, - return_tensors="pt") - - processed_outputs = super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - tok_kwargs=tok_kwargs, - ) - - pixel_values = processed_outputs["pixel_values"] - # split pixel values into patches corresponding to each image - images_spatial_crop = processed_outputs["images_spatial_crop"] - patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop] - pixel_values = pixel_values.split(patches_per_image) - processed_outputs["pixel_values"] = pixel_values + processed_outputs = tokenizer(prompt, + add_special_tokens=True, + return_tensors="pt") return processed_outputs diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 56e456c2f1f2a..399c739f408ee 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -761,6 +761,12 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only): class Florence2ProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor(self): + return self.ctx.get_hf_processor() + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b61e0361fe8c3..7e1d478562a4c 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -83,8 +83,8 @@ class FuyuProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object): return self.ctx.get_hf_processor(FuyuProcessor, **kwargs) - def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor: - return self.get_hf_processor(**kwargs).image_processor + def get_image_processor(self) -> FuyuImageProcessor: + return self.get_hf_processor().image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index e16c03c8d3b57..a58b32793dbef 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -46,7 +46,6 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsQuant from .utils import (AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_layers, maybe_prefix) @@ -69,7 +68,6 @@ class Gemma3nAltUp(nn.Module): altup_num_inputs: int, altup_coef_clip: float, altup_active_idx: int, - quant_config: QuantizationConfig, prefix: str, ): super().__init__() @@ -82,7 +80,6 @@ class Gemma3nAltUp(nn.Module): altup_num_inputs, altup_num_inputs, bias=False, - quant_config=quant_config, prefix=f"{prefix}.correction_coefs", return_bias=False, ) @@ -90,7 +87,6 @@ class Gemma3nAltUp(nn.Module): altup_num_inputs, altup_num_inputs**2, bias=False, - quant_config=quant_config, prefix=f"{prefix}.prediction_coefs", return_bias=False, ) @@ -98,7 +94,6 @@ class Gemma3nAltUp(nn.Module): hidden_size, altup_num_inputs, bias=False, - quant_config=quant_config, prefix=f"{prefix}.modality_router", return_bias=False, ) @@ -405,7 +400,6 @@ class Gemma3nDecoderLayer(nn.Module): altup_num_inputs=config.altup_num_inputs, altup_coef_clip=config.altup_coef_clip, altup_active_idx=config.altup_active_idx, - quant_config=quant_config, prefix=f"{prefix}.altup", ) self.self_attn = Gemma3nAttention( @@ -533,7 +527,7 @@ class Gemma3nDecoderLayer(nn.Module): @support_torch_compile -class Gemma3nTextModel(nn.Module, SupportsQuant): +class Gemma3nTextModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -546,7 +540,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, - quant_config=quant_config, prefix=f"{prefix}.embed_tokens", ) self.embed_scale = torch.tensor( @@ -556,7 +549,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): self.embed_tokens_per_layer = VocabParallelEmbedding( config.vocab_size_per_layer_input, config.num_hidden_layers * config.hidden_size_per_layer_input, - quant_config=quant_config, prefix=f"{prefix}.per_layer_embed_tokens", ) self.embed_scale_per_layer = torch.tensor( @@ -590,7 +582,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): gather_output=True, return_bias=False, quant_config=quant_config, - prefix=f"{prefix}.altup_projections.{idx-1}", + prefix=f"{prefix}.{idx-1}.altup_projections", ) for idx in range(1, self.config.altup_num_inputs) ]) self.altup_unembed_projections = nn.ModuleList([ @@ -601,7 +593,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): gather_output=True, return_bias=False, quant_config=quant_config, - prefix=f"{prefix}.altup_unembed_projections.{idx-1}", + prefix=f"{prefix}.{idx-1}.altup_unembed_projections", ) for idx in range(1, self.config.altup_num_inputs) ]) @@ -782,7 +774,7 @@ class Gemma3nModel(nn.Module): **kwargs) -class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant): +class Gemma3nForConditionalGeneration(nn.Module): packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 7c9840790fe3e..ae1bf22c704e5 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -809,11 +809,11 @@ class Glm4vProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": 1} - def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor: - return self.get_hf_processor(**kwargs).image_processor + def get_image_processor(self) -> Glm4vImageProcessor: + return self.get_hf_processor().image_processor - def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor: - return self.get_hf_processor(**kwargs).video_processor + def get_video_processor(self) -> Glm4vVideoProcessor: + return self.get_hf_processor().video_processor def _get_vision_info( self, @@ -937,7 +937,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo): total_frames: int) -> list[int]: video_processor = self.get_video_processor() - video_fps = metadata.get("fps", video_processor.fps) + video_fps = metadata.get("fps", 2.0) meta_frames = metadata.get("total_num_frames", total_frames) max_frame_idx = meta_frames - 1 duration = metadata.get("duration", @@ -1120,7 +1120,11 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): video_placeholder, ) - video_grid_thw_lst.append(video_outputs["video_grid_thw"]) + grid_t = len(video_outputs["video_grid_thw"]) + _, grid_h, grid_w = video_outputs["video_grid_thw"][0] + grid_thw = torch.tensor([[grid_t, grid_h, grid_w]]) + + video_grid_thw_lst.append(grid_thw) pixel_values_videos_lst.append( video_outputs["pixel_values_videos"]) video_outputs = dict( diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index c3e4f81597adb..467b074f37753 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -392,7 +392,21 @@ class H2OVLProcessor(BaseInternVLProcessor): class H2OVLProcessingInfo(BaseInternVLProcessingInfo): - def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor: + def get_hf_processor( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + **kwargs: object, + ) -> H2OVLProcessor: + if min_dynamic_patch is not None: + kwargs["min_dynamic_patch"] = min_dynamic_patch + if max_dynamic_patch is not None: + kwargs["max_dynamic_patch"] = max_dynamic_patch + if dynamic_image_size is not None: + kwargs["dynamic_image_size"] = dynamic_image_size + return self.ctx.init_processor( H2OVLProcessor, config=self.get_hf_config(), diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index e5c94c7f3a706..3e8e50b35c0b7 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -25,7 +25,8 @@ import torch import torch.nn as nn from timm.layers import LayerNorm, LayerNorm2d from timm.models.regnet import RegStage -from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig +from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig, + SiglipVisionConfig) from transformers.modeling_utils import no_init_weights from vllm.config import VllmConfig @@ -79,9 +80,26 @@ HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs] class HCXVisionProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config() + def get_vision_encoder_info(self): return get_vision_encoder_info(self.get_hf_config()) + def get_hf_processor( + self, + **kwargs: object, + ): + processor_cls = type( + AutoProcessor.from_pretrained( + self.ctx.model_config.model, + trust_remote_code=self.ctx.model_config.trust_remote_code, + )) + return self.ctx.get_hf_processor( + processor_cls, + **kwargs, + ) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 3c01789b90066..6e991d99b9638 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -88,7 +88,15 @@ ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] class Idefics3ProcessingInfo(BaseProcessingInfo): - def get_hf_processor(self, **kwargs: object) -> Idefics3Processor: + def get_hf_processor( + self, + *, + size: Optional[dict[str, int]] = None, + **kwargs: object, + ) -> Idefics3Processor: + if size is not None: + kwargs["size"] = size + return self.ctx.get_hf_processor(Idefics3Processor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8e766dd4c4768..a0e98ca3f8155 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -665,7 +665,14 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod - def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor: + def get_hf_processor( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + **kwargs: object, + ) -> BaseInternVLProcessor: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: @@ -875,12 +882,27 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): return max(max_frames_per_video, 1) - def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: + def get_hf_processor( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + **kwargs: object, + ) -> InternVLProcessor: + if min_dynamic_patch is not None: + kwargs["min_dynamic_patch"] = min_dynamic_patch + if max_dynamic_patch is not None: + kwargs["max_dynamic_patch"] = max_dynamic_patch + if dynamic_image_size is not None: + kwargs["dynamic_image_size"] = dynamic_image_size + + kwargs["video_token"] = self.get_video_token() + return self.ctx.init_processor( InternVLProcessor, config=self.get_hf_config(), tokenizer=self.get_tokenizer(), - video_token=self.get_video_token(), **kwargs, ) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 4d8aa8de0f0b1..892d970aaade0 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -44,6 +44,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope +from vllm.transformers_utils.processor import ( + cached_image_processor_from_config) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -978,8 +980,72 @@ class KeyeMultiModalDataParser(MultiModalDataParser): class KeyeProcessingInfo(BaseProcessingInfo): - def get_image_processor(self, **kwargs: object): - return self.get_hf_processor(**kwargs).image_processor + def get_hf_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + **kwargs: object, + ): + return self.ctx.get_hf_processor( + image_processor=self.get_image_processor( + min_pixels=min_pixels, + max_pixels=max_pixels, + size=size, + ), + **kwargs, + ) + + def _get_image_processor_kwargs( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + **kwargs: object, + ): + if self.ctx.model_config.mm_processor_kwargs: + kwargs.update(self.ctx.model_config.mm_processor_kwargs) + + if min_pixels is not None: + kwargs["min_pixels"] = min_pixels + + if size is None: + size = {"shortest_edge": min_pixels} + else: + size["shortest_edge"] = min_pixels + + if max_pixels is not None: + kwargs["max_pixels"] = max_pixels + + if size is None: + size = {"longest_edge": max_pixels} + else: + size["longest_edge"] = max_pixels + + if size is not None: + kwargs["size"] = size + + return kwargs + + def get_image_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + **kwargs: object, + ): + return cached_image_processor_from_config( + self.ctx.model_config, + **self._get_image_processor_kwargs( + min_pixels=min_pixels, + max_pixels=max_pixels, + size=size, + **kwargs, + ), + ) def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} @@ -1180,6 +1246,20 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: return KeyeMultiModalDataParser() + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs) + return self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + dict(**mm_kwargs, **tok_kwargs), + ) + def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 572930c39a846..71275f0d58579 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -51,25 +51,6 @@ class LlamaDecoderLayer(LlamaDecoderLayer): self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - if getattr(config, "norm_before_residual", False): - self._residual_norm = self._norm_before_residual - else: - self._residual_norm = self._norm_after_residual - - def _norm_before_residual( - self, - hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - hidden_states = self.hidden_norm(hidden_states) - residual = hidden_states - return hidden_states, residual - - def _norm_after_residual( - self, - hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - residual = hidden_states - hidden_states = self.hidden_norm(hidden_states) - return hidden_states, residual - def forward( self, positions: torch.Tensor, @@ -78,10 +59,9 @@ class LlamaDecoderLayer(LlamaDecoderLayer): residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: + residual = hidden_states embeds = self.input_layernorm(embeds) - - hidden_states, residual = self._residual_norm( - hidden_states=hidden_states) + hidden_states = self.hidden_norm(hidden_states) hidden_states = torch.cat([embeds, hidden_states], dim=-1) # Self Attention @@ -122,7 +102,7 @@ class LlamaModel(nn.Module): self.layers = nn.ModuleList([ LlamaDecoderLayer( - config=self.config, + self.config, prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"), ) ]) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index c863ba406422d..0126ace09e707 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -8,9 +8,11 @@ from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, import torch import torch.nn as nn +from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -305,14 +307,29 @@ class PixtralHFMultiModalProcessor( pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: - # Avoid padding since we need the output for each image to be - # independent of other images for the cache to work correctly - image_sizes = processed_outputs["image_sizes"] - assert len(pixel_values) == len(image_sizes) + # Before/after https://github.com/huggingface/transformers/pull/35122 + if Version(TRANSFORMERS_VERSION) <= Version("4.48.3"): + images = mm_data["images"] + assert isinstance(images, list) - processed_outputs["pixel_values"] = [ - p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes) - ] + # Original output: (1, num_images, C, H, W) + # New output: (num_images, C, H, W) + assert (isinstance(pixel_values, list) + and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) + and len(pixel_values[0]) == len(images)) + + processed_outputs["pixel_values"] = pixel_values[0] + else: + # Avoid padding since we need the output for each image to be + # independent of other images for the cache to work correctly + image_sizes = processed_outputs["image_sizes"] + assert len(pixel_values) == len(image_sizes) + + processed_outputs["pixel_values"] = [ + p[:, :h, :w] + for p, (h, w) in zip(pixel_values, image_sizes) + ] return processed_outputs @@ -767,10 +784,17 @@ class MantisProcessingInfo(LlavaProcessingInfo): vision_info = self.get_vision_encoder_info() kwargs.setdefault("patch_size", vision_info.get_patch_size()) - kwargs.setdefault( - "vision_feature_select_strategy", - hf_config.vision_feature_select_strategy, - ) + + if Version(TRANSFORMERS_VERSION) < Version("4.48"): + # BUG: num_additional_image_tokens = 0 but treated as 1, + # so we set vision_feature_select_strategy to None to offset this + kwargs.setdefault("vision_feature_select_strategy", None) + else: + # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 + kwargs.setdefault( + "vision_feature_select_strategy", + hf_config.vision_feature_select_strategy, + ) return self.ctx.get_hf_processor(LlavaProcessor, **kwargs) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index e172758b2f2c5..70f2d4a6420b9 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -331,8 +331,10 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): return hf_processor - def get_image_processor(self, **kwargs: object): - return self.get_hf_processor(**kwargs).image_processor + def get_image_processor(self): + hf_processor = self.get_hf_processor() + image_processor = hf_processor.image_processor # type: ignore + return image_processor def get_model_version(self): return get_version_by_config(self.get_hf_config()) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 924f10d82b381..dea85d320adfd 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -533,7 +533,7 @@ class Mllama4ProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object) -> Llama4Processor: return self.ctx.get_hf_processor(Llama4Processor, - use_fast=kwargs.pop("use_fast", True), + use_fast=True, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index b90cb9b39a60b..5d0513d707413 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -137,16 +137,34 @@ class NemotronVLProcessor(InternVLProcessor): class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): """Processing info for Nemotron VL models.""" - def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor: + def get_hf_processor( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + **kwargs: object, + ) -> NemotronVLProcessor: + if min_dynamic_patch is not None: + kwargs["min_dynamic_patch"] = min_dynamic_patch + if max_dynamic_patch is not None: + kwargs["max_dynamic_patch"] = max_dynamic_patch + if dynamic_image_size is not None: + kwargs["dynamic_image_size"] = dynamic_image_size + + image_processor = self.get_image_processor() return self.ctx.init_processor( NemotronVLProcessor, config=self.get_hf_config(), tokenizer=self.get_tokenizer(), - image_processor=self.get_image_processor(), + image_processor=image_processor, **kwargs, ) - def get_image_processor(self, **kwargs: object): + def get_image_processor( + self, + **kwargs: object, + ): return cached_image_processor_from_config( self.ctx.model_config, **kwargs, diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 4bea1392a6814..2f7f8e437f0ad 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -63,7 +63,21 @@ class NVLMProcessor(BaseInternVLProcessor): class NVLMProcessingInfo(BaseInternVLProcessingInfo): - def get_hf_processor(self, **kwargs: object) -> NVLMProcessor: + def get_hf_processor( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + **kwargs: object, + ) -> NVLMProcessor: + if min_dynamic_patch is not None: + kwargs["min_dynamic_patch"] = min_dynamic_patch + if max_dynamic_patch is not None: + kwargs["max_dynamic_patch"] = max_dynamic_patch + if dynamic_image_size is not None: + kwargs["dynamic_image_size"] = dynamic_image_size + return self.ctx.init_processor( NVLMProcessor, config=self.get_hf_config(), diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 6b27980e0b0c3..c8b528048b557 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -25,7 +25,7 @@ import torch import torch.nn as nn from torch import Tensor from torch.nn.functional import gumbel_softmax, pad, softmax -from transformers import BatchFeature, PretrainedConfig +from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear @@ -245,12 +245,11 @@ class VisualEmbedding(torch.nn.Embedding): class OvisProcessingInfo(BaseProcessingInfo): - def get_hf_processor(self, **kwargs: object): + def get_hf_processor(self, **kwargs): return self.ctx.get_hf_processor( OvisProcessor, image_pad_token=self.get_image_pad_token(), image_segment_len=self.get_image_segment_len(), - **kwargs, ) def get_image_segment_len(self) -> int: @@ -270,6 +269,9 @@ class OvisProcessingInfo(BaseProcessingInfo): text_model_type = hf_text_config.model_type return IMAGE_PAD_TOKEN_MAP.get(text_model_type) + def get_image_processor(self) -> BaseImageProcessor: + return self.get_hf_processor().image_processor # type: ignore + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 9ef4f8371eb3d..aa739f22fd7bf 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -318,6 +318,17 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): class Phi3VProcessingInfo(BaseProcessingInfo): + def get_hf_processor( + self, + *, + num_crops: Optional[int] = None, + **kwargs: object, + ) -> ProcessorMixin: + if num_crops is not None: + kwargs["num_crops"] = num_crops + + return self.ctx.get_hf_processor(**kwargs) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index e13b8276bf17a..432b707a61591 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -696,12 +696,19 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> Phi4MultimodalConfig: return self.ctx.get_hf_config(Phi4MultimodalConfig) - def get_hf_processor(self, **kwargs: object) -> Phi4MMProcessor: - return self.ctx.get_hf_processor(Phi4MMProcessor, **kwargs) + def get_hf_processor( + self, + *, + dynamic_hd: Optional[int] = None, + **kwargs: object, + ) -> Phi4MMProcessor: + if dynamic_hd is not None: + kwargs["dynamic_hd"] = dynamic_hd - def get_feature_extractor( - self, **kwargs: object) -> Phi4MultimodalFeatureExtractor: - return self.get_hf_processor(**kwargs).audio_processor + return self.ctx.get_hf_processor(**kwargs) + + def get_feature_extractor(self) -> Phi4MultimodalFeatureExtractor: + return self.get_hf_processor().audio_processor def get_image_processor( self, @@ -1000,7 +1007,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): if audio_data: audio_features = processed_outputs['audio_input_features'] - sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate + sr = self.info.get_feature_extractor().sampling_rate feature_sizes = [ self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data @@ -1036,8 +1043,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): audio_token_id = tokenizer.vocab[tokenizer.audio_token] hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - audio_processor = self.info.get_feature_extractor( - **hf_processor_mm_kwargs) + audio_processor = self.info.get_feature_extractor() def get_image_replacement_phi4mm(item_idx: int): images = mm_items.get_items( diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 73e8446e6dea7..9b61c3634d841 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -459,6 +459,17 @@ def cat_with_pad(tensors, dim, padding_value=0): class Phi4MMProcessingInfo(BaseProcessingInfo): + def get_hf_processor( + self, + *, + dynamic_hd: Optional[int] = None, + **kwargs: object, + ) -> ProcessorMixin: + if dynamic_hd is not None: + kwargs["dynamic_hd"] = dynamic_hd + + return self.ctx.get_hf_processor(**kwargs) + @property def image_tokens(self) -> list[str]: return [f"<|image_{i+1}|>" for i in range(100)] @@ -476,9 +487,8 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): image_processor = processor.image_processor return image_processor.dynamic_hd - def get_feature_extractor(self, - **kwargs: object) -> SequenceFeatureExtractor: - return self.get_hf_processor(**kwargs).audio_processor + def get_feature_extractor(self) -> SequenceFeatureExtractor: + return self.get_hf_processor().audio_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None, "image": None} @@ -759,7 +769,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") - sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate + sr = self.info.get_feature_extractor().sampling_rate if (audio_data := mm_data.get("audios", [])): mm_data['audios'] = [(data, sr) for data in audio_data] @@ -806,8 +816,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): ) -> Sequence[PromptUpdate]: image_tokens: list[str] = self.info.image_tokens # type: ignore audio_tokens: list[str] = self.info.audio_tokens # type: ignore - feature_extractor = self.info.get_feature_extractor( - **hf_processor_mm_kwargs) + feature_extractor = self.info.get_feature_extractor() hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) def get_image_replacement_phi4mm(item_idx: int): diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 0e7507a4570be..23f65b99c22ce 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -330,8 +330,6 @@ class Qwen2Model(nn.Module): else: self.norm = PPMissingLayer() - self.aux_hidden_state_layers: tuple[int] = tuple() - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -352,25 +350,18 @@ class Qwen2Model(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - - aux_hidden_states = [] - for idx, layer in enumerate( - self.layers[self.start_layer:self.end_layer]): - if idx in self.aux_hidden_state_layers: - aux_hidden_states.append(hidden_states + residual) - hidden_states, residual = layer(positions, hidden_states, residual) - + for layer in self.layers[self.start_layer:self.end_layer]: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, "residual": residual }) - hidden_states, _ = self.norm(hidden_states, residual) - - if len(aux_hidden_states) > 0: - return hidden_states, aux_hidden_states - return hidden_states def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index b9fed79c84cdd..c5a5c10d9509f 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -132,15 +132,50 @@ class Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo, def get_hf_config(self): return self.ctx.get_hf_config(Qwen2_5OmniConfig).thinker_config - def get_hf_processor(self, **kwargs: object) -> Qwen2_5OmniProcessor: - return self.ctx.get_hf_processor( - Qwen2_5OmniProcessor, - use_fast=kwargs.pop("use_fast", True), + def get_hf_processor( + self, + *, + sampling_rate: Optional[int] = None, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + fps: Optional[Union[float, list[float]]] = None, + **kwargs: object, + ) -> Qwen2_5OmniProcessor: + if fps is not None: + kwargs["fps"] = fps + + # Monkey patch for Transformers v4.53 + processor_class = Qwen2_5OmniProcessor + if processor_class.image_processor_class != "AutoImageProcessor": + processor_class.image_processor_class = "AutoImageProcessor" + if processor_class.video_processor_class != "AutoVideoProcessor": + processor_class.video_processor_class = "AutoVideoProcessor" + + processor = self.ctx.get_hf_processor( + processor_class, + image_processor=self.get_image_processor(min_pixels=min_pixels, + max_pixels=max_pixels, + size=size, + use_fast=kwargs.get( + "use_fast", True)), **kwargs, ) + if not hasattr(processor, "audio_token"): + processor.audio_token = "<|AUDIO|>" + if not hasattr(processor, "image_token"): + processor.image_token = "<|IMAGE|>" + if not hasattr(processor, "video_token"): + processor.video_token = "<|VIDEO|>" + return processor - def get_feature_extractor(self, **kwargs: object): - hf_processor = self.get_hf_processor(**kwargs) + def get_feature_extractor( + self, + *, + sampling_rate: Optional[int] = None, + **kwargs: object, + ): + hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 04e64422d2e0b..8ae096536fdc5 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -43,10 +43,9 @@ from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.activation import get_act_and_mul_fn +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -172,12 +171,16 @@ class Qwen2_5_VisionMLP(nn.Module): quant_config: Optional[QuantizationConfig] = None, prefix: str = ""): super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - input_size=in_features, - output_sizes=[hidden_features] * 2, # [gate_proj, up_proj] - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") + self.gate_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_proj") + self.up_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj") self.down_proj = RowParallelLinear(hidden_features, in_features, bias=bias, @@ -186,9 +189,10 @@ class Qwen2_5_VisionMLP(nn.Module): self.act_fn = act_fn def forward(self, x: torch.Tensor): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x_down, _ = self.down_proj(x) + x_gate, _ = self.gate_proj(x) + x_gate = self.act_fn(x_gate) + x_up, _ = self.up_proj(x) + x_down, _ = self.down_proj(x_gate * x_up) return x_down @@ -536,14 +540,14 @@ class Qwen2_5_VisionTransformer(nn.Module): self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) self.blocks = nn.ModuleList([ - Qwen2_5_VisionBlock(dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=get_act_and_mul_fn( - vision_config.hidden_act), - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") + Qwen2_5_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") for layer_idx in range(depth) ]) self.merger = Qwen2_5_VisionPatchMerger( @@ -748,8 +752,6 @@ class Qwen2_5_VisionTransformer(nn.Module): ("attn.qkv.", "attn.q.", "q"), ("attn.qkv.", "attn.k.", "k"), ("attn.qkv.", "attn.v.", "v"), - ("mlp.gate_up_proj.", "mlp.gate_proj.", 0), - ("mlp.gate_up_proj.", "mlp.up_proj.", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: set[str] = set() @@ -778,10 +780,25 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2_5_VLConfig) - def get_hf_processor(self, **kwargs: object) -> Qwen2_5_VLProcessor: + def get_hf_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + fps: Optional[Union[float, list[float]]] = None, + **kwargs: object, + ) -> Qwen2_5_VLProcessor: + if fps is not None: + kwargs["fps"] = fps + return self.ctx.get_hf_processor( Qwen2_5_VLProcessor, - use_fast=kwargs.pop("use_fast", True), + image_processor=self.get_image_processor(min_pixels=min_pixels, + max_pixels=max_pixels, + size=size, + use_fast=kwargs.get( + "use_fast", True)), **kwargs, ) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 3ef55cd704cf0..d7fec30acd8d3 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -86,12 +86,22 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2AudioConfig) - def get_hf_processor(self, **kwargs: object) -> Qwen2AudioProcessor: + def get_hf_processor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + **kwargs: object, + ) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs) - def get_feature_extractor(self, - **kwargs: object) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor(**kwargs) + def get_feature_extractor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 4e8ea8e449133..ad63bb4af4e9d 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -69,6 +69,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope +from vllm.transformers_utils.processor import ( + cached_image_processor_from_config) from vllm.transformers_utils.tokenizer import AnyTokenizer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -750,15 +752,73 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) - def get_hf_processor(self, **kwargs: object) -> Qwen2VLProcessor: + def get_hf_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + **kwargs: object, + ) -> Qwen2VLProcessor: return self.ctx.get_hf_processor( Qwen2VLProcessor, - use_fast=kwargs.pop("use_fast", True), + image_processor=self.get_image_processor(min_pixels=min_pixels, + max_pixels=max_pixels, + size=size, + use_fast=kwargs.get( + "use_fast", True)), **kwargs, ) - def get_image_processor(self, **kwargs: object) -> Qwen2VLImageProcessor: - return self.get_hf_processor(**kwargs).image_processor + def _get_image_processor_kwargs( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + **kwargs: object, + ): + mm_config = self.ctx.model_config.get_multimodal_config() + if mm_config.mm_processor_kwargs: + kwargs.update(mm_config.mm_processor_kwargs) + + if min_pixels is not None: + kwargs["min_pixels"] = min_pixels + + if size is None: + size = {"shortest_edge": min_pixels} + else: + size["shortest_edge"] = min_pixels + + if max_pixels is not None: + kwargs["max_pixels"] = max_pixels + + if size is None: + size = {"longest_edge": max_pixels} + else: + size["longest_edge"] = max_pixels + + if size is not None: + kwargs["size"] = size + + return kwargs + + def get_image_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + **kwargs: object, + ) -> Qwen2VLImageProcessor: + kwargs["use_fast"] = kwargs.get("use_fast", True) + return cached_image_processor_from_config( + self.ctx.model_config, + **self._get_image_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels, + size=size, + **kwargs), + ) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} @@ -963,6 +1023,20 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] def _get_data_parser(self) -> MultiModalDataParser: return Qwen2VLMultiModalDataParser() + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs) + return self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + dict(**mm_kwargs, **tok_kwargs), + ) + def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index d2ae8959b103d..393ce41a91a00 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -288,13 +288,6 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None: - self.model.aux_hidden_state_layers = layers - - def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]: - num_layers = len(self.model.layers) - return (2, num_layers // 2, num_layers - 3) - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 0c5d87a7dc472..848c04b9b32f7 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -270,9 +270,8 @@ _TRANSFORMERS_SUPPORTED_MODELS = { } _TRANSFORMERS_BACKEND_MODELS = { - "TransformersModel": ("transformers", "TransformersModel"), - "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 + "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), } # yapf: enable diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index c76aabcd27ccb..5ae5c0bc1d5dc 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -7,8 +7,9 @@ # Copyright (c) 2025 Skywork # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, TypedDict, Union +from typing import Literal, Optional, TypedDict, TypeVar, Union import torch import torch.nn as nn @@ -231,7 +232,7 @@ def image_to_pixel_values_skyworkr1v( return pixel_values -class SkyworkR1VProcessor: +class BaseSkyworkR1VProcessor(ABC): """ This model doesn't define its own HF processor, so we implement our own one here. @@ -278,18 +279,17 @@ class SkyworkR1VProcessor: self.use_thumbnail: bool = config.use_thumbnail @property + @abstractmethod def image_token_id(self) -> int: - return self.tokenizer.get_vocab()[IMG_CONTEXT] + raise NotImplementedError + @abstractmethod def get_image_repl( self, feature_size: int, num_patches: Optional[int], ) -> PromptUpdateDetails[str]: - repl_features = IMG_CONTEXT * feature_size - repl_full = IMG_START + repl_features + IMG_END - - return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) + raise NotImplementedError def resolve_min_max_num( self, @@ -426,15 +426,35 @@ class SkyworkR1VProcessor: } -class SkyworkR1VProcessingInfo(BaseProcessingInfo): +class SkyworkR1VProcessor(BaseSkyworkR1VProcessor): - def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor: - return self.ctx.init_processor( - SkyworkR1VProcessor, - config=self.get_hf_config(), - tokenizer=self.get_tokenizer(), - **kwargs, - ) + @property + def image_token_id(self) -> int: + return self.tokenizer.get_vocab()[IMG_CONTEXT] + + def get_image_repl( + self, + feature_size: int, + num_patches: Optional[int], + ) -> PromptUpdateDetails[str]: + repl_features = IMG_CONTEXT * feature_size + repl_full = IMG_START + repl_features + IMG_END + + return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) + + +class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): + + @abstractmethod + def get_hf_processor( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + **kwargs: object, + ) -> BaseSkyworkR1VProcessor: + raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -444,7 +464,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Optional[SkyworkR1VProcessor], + processor: Optional[BaseSkyworkR1VProcessor], ) -> int: if processor is None: processor = self.get_hf_processor() @@ -480,8 +500,10 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo): return largest_feature_pinpoint -class SkyworkR1VDummyInputsBuilder( - BaseDummyInputsBuilder[SkyworkR1VProcessingInfo]): +_I = TypeVar("_I", bound=BaseSkyworkR1VProcessingInfo) + + +class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -505,8 +527,7 @@ class SkyworkR1VDummyInputsBuilder( } -class SkyworkR1VMultiModalProcessor( - BaseMultiModalProcessor[SkyworkR1VProcessingInfo]): +class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]): def _call_hf_processor( self, @@ -596,6 +617,31 @@ class SkyworkR1VMultiModalProcessor( ] +class SkyworkR1VProcessingInfo(BaseSkyworkR1VProcessingInfo): + + def get_hf_processor( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + **kwargs: object, + ) -> SkyworkR1VProcessor: + if min_dynamic_patch is not None: + kwargs["min_dynamic_patch"] = min_dynamic_patch + if max_dynamic_patch is not None: + kwargs["max_dynamic_patch"] = max_dynamic_patch + if dynamic_image_size is not None: + kwargs["dynamic_image_size"] = dynamic_image_size + + return self.ctx.init_processor( + SkyworkR1VProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, + ) + + @MULTIMODAL_REGISTRY.register_processor( SkyworkR1VMultiModalProcessor, info=SkyworkR1VProcessingInfo, diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index 2adfad67152b3..0f22ba5b406ce 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -19,7 +19,15 @@ from .idefics3 import Idefics3ProcessingInfo class SmolVLMProcessingInfo(Idefics3ProcessingInfo): - def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor: + def get_hf_processor( + self, + *, + max_image_size: Optional[dict[str, int]] = None, + **kwargs: object, + ) -> SmolVLMProcessor: + if max_image_size is not None: + kwargs["max_image_size"] = max_image_size + return self.ctx.get_hf_processor(SmolVLMProcessor, **kwargs) def _get_image_token( diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 70cf5e95a54e1..979d789b330cf 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -178,11 +178,13 @@ class TarsierProcessingInfo(BaseProcessingInfo): return get_vision_encoder_info(self.get_hf_config()) def get_hf_processor(self, **kwargs: object) -> TarsierProcessor: - vision_info = self.get_vision_encoder_info() - - kwargs.setdefault("patch_size", vision_info.get_patch_size()) - - return self.ctx.get_hf_processor(TarsierProcessor, **kwargs) + hf_processor = self.ctx.get_hf_processor(TarsierProcessor, **kwargs) + # Patch for patch_size if needed (copied from vLLM LLaVA) + if hasattr(hf_processor, + 'patch_size') and hf_processor.patch_size is None: + patch_size = self.get_vision_encoder_info().get_patch_size() + hf_processor.patch_size = patch_size + return hf_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 5059d1e1d9fea..8cd95605cdfae 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -48,6 +48,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP, @@ -188,6 +189,10 @@ class MultiModalProcessingInfo(BaseProcessingInfo): image_tokens = mm_tokens["num_image_tokens"][0] return image_tokens + def get_hf_processor(self): + processor = cached_get_processor(self.ctx.model_config.model) + return processor + def get_max_image_size(self): return 10_000, 10_000 # hardcode for arbitrary very large size @@ -651,18 +656,6 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) -@support_torch_compile -class TransformersModel(TransformersBase): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - # Add `model.` prefix for base model checkpoints - "": "model.", - # Remove `model.` from places it should not be - "model.model.": "model.", - "model.score": "score", - }) - - @support_torch_compile class TransformersForCausalLM(TransformersBase): diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index bef34c1be49fe..a4569ccd5a845 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -71,7 +71,13 @@ UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, class UltravoxProcessingInfo(BaseProcessingInfo): - def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + def get_hf_processor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + **kwargs: object, + ) -> ProcessorMixin: config = self.ctx.model_config.hf_config hf_processor = self.ctx.get_hf_processor(**kwargs) @@ -83,9 +89,13 @@ class UltravoxProcessingInfo(BaseProcessingInfo): return hf_processor - def get_feature_extractor(self, - **kwargs: object) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor(**kwargs) + def get_feature_extractor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) audio_processor = hf_processor.audio_processor # type: ignore feature_extractor = audio_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) @@ -146,7 +156,7 @@ class UltravoxMultiModalProcessor( audios = mm_data.pop("audios", []) assert isinstance(audios, list) - feature_extractor = self.info.get_feature_extractor(**mm_kwargs) + feature_extractor = self.info.get_feature_extractor() mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index ca02ecd828ba3..d7bafb9ef84d9 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -623,22 +623,23 @@ class WhisperProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> WhisperConfig: return self.ctx.get_hf_config(WhisperConfig) - def get_hf_processor(self, **kwargs: object) -> WhisperProcessor: - # HACK: Transformers 4.53.2 has issue with whisper tokenizer to + def get_hf_processor(self, + sampling_rate: Optional[int] = None + ) -> WhisperProcessor: + # HACK: Transformers 4.53.0 has issue with whisper tokenizer to # initialize processor. We use a monkeypatch to fix it here. # See: https://github.com/vllm-project/vllm/issues/20224 processor_class = WhisperProcessor tokenizer_class = ("WhisperTokenizer", "WhisperTokenizerFast") if processor_class.tokenizer_class != tokenizer_class: processor_class.tokenizer_class = tokenizer_class - return self.ctx.get_hf_processor(processor_class, **kwargs) + return self.ctx.get_hf_processor(processor_class) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": 1} - def get_feature_extractor(self, - **kwargs: object) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor(**kwargs) + def get_feature_extractor(self) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor() feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor @@ -701,7 +702,7 @@ class WhisperMultiModalProcessor( tok_kwargs: Mapping[str, object], ) -> BatchFeature: if mm_data: - feature_extractor = self.info.get_feature_extractor(**mm_kwargs) + feature_extractor = self.info.get_feature_extractor() mm_data = dict(audio=mm_data.pop("audios")) mm_kwargs = dict( **mm_kwargs, diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 41ed0b09c5a2a..2b20ca2a3ba3f 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utils for model executor.""" - import copy from typing import Any, Optional @@ -10,7 +9,6 @@ import torch def set_random_seed(seed: int) -> None: from vllm.platforms import current_platform - current_platform.seed_everything(seed) @@ -31,7 +29,7 @@ def set_weight_attrs( return for key, value in weight_attrs.items(): assert not hasattr( - weight, key), f"Overwriting existing tensor attribute: {key}" + weight, key), (f"Overwriting existing tensor attribute: {key}") # NOTE(woosuk): During weight loading, we often do something like: # narrowed_tensor = param.data.narrow(0, offset, len) @@ -43,7 +41,6 @@ def set_weight_attrs( # we sync the param tensor after its weight loader is called. # TODO(woosuk): Remove this hack once we have a better solution. from vllm.platforms import current_platform - if current_platform.is_tpu() and key == "weight_loader": value = _make_synced_weight_loader(value) setattr(weight, key, value) @@ -80,17 +77,4 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: f"safely because of conflicts from {type(child).__name__}.") else: parent_map.update(child_map) - return parent_map - - -def get_moe_expert_mapping( - model: torch.nn.Module, ) -> list[tuple[str, str, int, str]]: - if parent_map := getattr(model, "get_expert_mapping", None): - return parent_map() - else: - # We only check main components instead of whole model submodules - for child in model.children(): - child_map = getattr(child, "get_expert_mapping", None) - if child_map is not None: - return child_map() - return [] + return parent_map \ No newline at end of file diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 1006c1ce4b241..a0448a80ac7c2 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -3,7 +3,6 @@ from io import BytesIO from pathlib import Path -from typing import Union import pybase64 import torch @@ -24,10 +23,9 @@ def rescale_image_size(image: Image.Image, return image +# TODO: Support customizable background color to fill in. def rgba_to_rgb( - image: Image.Image, - background_color: Union[tuple[int, int, int], list[int]] = (255, 255, 255) -) -> Image.Image: + image: Image.Image, background_color=(255, 255, 255)) -> Image.Image: """Convert an RGBA image to RGB with filled background color.""" assert image.mode == "RGBA" converted = Image.new("RGB", image.size, background_color) @@ -57,35 +55,10 @@ class ImageMediaIO(MediaIO[Image.Image]): # for flexible control. self.kwargs = kwargs - # Extract RGBA background color from kwargs if provided - # Default to white background for backward compatibility - rgba_bg = kwargs.get('rgba_background_color', (255, 255, 255)) - # Convert list to tuple for consistency - if isinstance(rgba_bg, list): - rgba_bg = tuple(rgba_bg) - - # Validate rgba_background_color format - if not (isinstance(rgba_bg, tuple) and len(rgba_bg) == 3 - and all(isinstance(c, int) and 0 <= c <= 255 - for c in rgba_bg)): - raise ValueError( - "rgba_background_color must be a list or tuple of 3 integers " - "in the range [0, 255].") - self.rgba_background_color = rgba_bg - - def _convert_image_mode(self, image: Image.Image) -> Image.Image: - """Convert image mode with custom background color.""" - if image.mode == self.image_mode: - return image - elif image.mode == "RGBA" and self.image_mode == "RGB": - return rgba_to_rgb(image, self.rgba_background_color) - else: - return convert_image_mode(image, self.image_mode) - def load_bytes(self, data: bytes) -> Image.Image: image = Image.open(BytesIO(data)) image.load() - return self._convert_image_mode(image) + return convert_image_mode(image, self.image_mode) def load_base64(self, media_type: str, data: str) -> Image.Image: return self.load_bytes(pybase64.b64decode(data, validate=True)) @@ -93,7 +66,7 @@ class ImageMediaIO(MediaIO[Image.Image]): def load_file(self, filepath: Path) -> Image.Image: image = Image.open(filepath) image.load() - return self._convert_image_mode(image) + return convert_image_mode(image, self.image_mode) def encode_base64( self, @@ -104,7 +77,7 @@ class ImageMediaIO(MediaIO[Image.Image]): image = media with BytesIO() as buffer: - image = self._convert_image_mode(image) + image = convert_image_mode(image, self.image_mode) image.save(buffer, image_format) data = buffer.getvalue() diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 0e633c2c0b6ae..fcaa48c1392a3 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -35,9 +35,8 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, NemotronConfig, NVLM_D_Config, - RWConfig, SpeculatorsConfig, - Step3TextConfig, Step3VLConfig, - UltravoxConfig) + RWConfig, Step3TextConfig, + Step3VLConfig, UltravoxConfig) # yapf: enable from vllm.transformers_utils.configs.mistral import adapt_config_dict from vllm.transformers_utils.utils import check_gguf_file @@ -82,7 +81,6 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, "eagle": EAGLEConfig, - "speculators": SpeculatorsConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, "ultravox": UltravoxConfig, @@ -289,27 +287,6 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: return config -def maybe_override_with_speculators_target_model( - model: str, - tokenizer: str, - trust_remote_code: bool, - revision: Optional[str] = None) -> tuple[str, str]: - """ - If running a speculators config, override running model with target model - """ - config_dict, _ = PretrainedConfig.get_config_dict( - model, - revision=revision, - trust_remote_code=trust_remote_code, - token=_get_hf_token(), - ) - spec_config = config_dict.get("speculators_config") - # Return the target model - if spec_config is not None: - model = tokenizer = spec_config["verifier"]["name_or_path"] - return model, tokenizer - - def get_config( model: Union[str, Path], trust_remote_code: bool, @@ -368,12 +345,9 @@ def get_config( token=_get_hf_token(), **kwargs, ) + # Use custom model class if it's in our registry model_type = config_dict.get("model_type") - if model_type is None: - model_type = "speculators" if config_dict.get( - "speculators_config") is not None else model_type - if model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] config = config_class.from_pretrained( diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 64ace167a5a00..96733da726181 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -24,7 +24,6 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config -from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, Step3VisionEncoderConfig, Step3VLConfig) @@ -45,7 +44,6 @@ __all__ = [ "NemotronHConfig", "Nemotron_Nano_VL_Config", "NVLM_D_Config", - "SpeculatorsConfig", "UltravoxConfig", "Step3VLConfig", "Step3VisionEncoderConfig", diff --git a/vllm/transformers_utils/configs/speculators/__init__.py b/vllm/transformers_utils/configs/speculators/__init__.py deleted file mode 100644 index 208f01a7cb5ee..0000000000000 --- a/vllm/transformers_utils/configs/speculators/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/transformers_utils/configs/speculators/algos.py b/vllm/transformers_utils/configs/speculators/algos.py deleted file mode 100644 index efc87b6bcf26f..0000000000000 --- a/vllm/transformers_utils/configs/speculators/algos.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -SUPPORTED_SPECULATORS_TYPES = {} - - -def register_speculator(name): - - def decorator(fn): - SUPPORTED_SPECULATORS_TYPES[name] = fn - return fn - - return decorator - - -@register_speculator("eagle3") -def update_eagle3(config_dict: dict, vllm_config: dict) -> None: - """ - Apply Eagle-3 specific configuration transformations. - - Eagle-3 specific fields: - - draft_vocab_size: Size of the draft model's vocabulary - - target_hidden_size: Hidden size of the target model - - norm_before_residual: Whether to apply norm before residual connection - """ - - vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size") - if config_dict.get("target_hidden_size") is not None: - vllm_config["target_hidden_size"] = config_dict["target_hidden_size"] - vllm_config["norm_before_residual"] = config_dict.get( - "norm_before_residual", True) - vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"] diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py deleted file mode 100644 index d7c16e180c709..0000000000000 --- a/vllm/transformers_utils/configs/speculators/base.py +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os -from typing import Any, Union - -from transformers import PretrainedConfig - -from vllm.transformers_utils.configs.speculators.algos import ( - SUPPORTED_SPECULATORS_TYPES) - -__all__ = ["SpeculatorsConfig"] - - -class SpeculatorsConfig(PretrainedConfig): - model_type = "speculators" - - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs, - ) -> "SpeculatorsConfig": - """Load speculators Eagle config and convert to vLLM format.""" - config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) - - speculators_model_type = config_dict.get("speculators_model_type") - if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES: - raise ValueError( - f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. " - "Please ensure you're loading a speculators-format model.") - - # validate fields - # TODO: @dsikka - use speculators pydantic model to validate - cls.validate_speculators_config(config_dict=config_dict) - # Convert from speculators config -> format that can be ingested by vLLM - vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict) - # Apply anything specific to the supported algorithm - algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type] - algo_updater(config_dict=config_dict, vllm_config=vllm_config) - return cls(**vllm_config) - - @classmethod - def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None: - try: - spec_config = config_dict["speculators_config"] - methods = spec_config["proposal_methods"] - first_method = methods[0] - _ = first_method["speculative_tokens"] - _ = spec_config["verifier"]["name_or_path"] - _ = config_dict["speculators_model_type"] - except (KeyError, IndexError, TypeError) as e: - raise ValueError("Invalid speculators config structure") from e - - if "transformer_layer_config" not in config_dict: - raise ValueError("Must provide transformer_layer_config") - - if not isinstance(config_dict["transformer_layer_config"], dict): - raise TypeError( - "'transformer_layer_config' must be a dictionary if provided") - - @classmethod - def convert_speculators_to_vllm( - cls, config_dict: dict[str, Any]) -> dict[str, Any]: - """ - Convert speculators config format to vLLM format. - - This method handles the translation of field names and structure - between speculators and vLLM formats. - - Returns: - Dictionary with vLLM-compatible configuration - """ - # Currently we only support one proposal method - spec_config = config_dict["speculators_config"] - first_method = spec_config.get("proposal_methods")[0] - num_lookahead_tokens = first_method.get("speculative_tokens") - - if num_lookahead_tokens is None: - raise ValueError( - "Missing 'speculative_tokens' in proposal method. " - f"Got: {first_method}") - - # Build base vLLM config - vllm_config = { - "method": config_dict.get("speculators_model_type"), - "num_lookahead_tokens": num_lookahead_tokens, - "target_model": spec_config.get("verifier")["name_or_path"] - } - vllm_config.update(config_dict["transformer_layer_config"]) - return vllm_config diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index a630d940b2578..70cd08263d372 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -4,15 +4,9 @@ from functools import lru_cache from typing import TYPE_CHECKING, Any, Optional, Union, cast -from transformers import (AutoFeatureExtractor, AutoImageProcessor, - AutoProcessor) -from transformers.feature_extraction_utils import FeatureExtractionMixin -from transformers.image_processing_utils import BaseImageProcessor from transformers.processing_utils import ProcessorMixin from typing_extensions import TypeVar -from vllm.utils import get_allowed_kwarg_only_overrides - if TYPE_CHECKING: from vllm.config import ModelConfig @@ -39,42 +33,23 @@ class HashableList(list): return hash(tuple(self)) -def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]): - if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin: - return AutoProcessor.from_pretrained - if hasattr(processor_cls, "from_pretrained"): - return processor_cls.from_pretrained - - return processor_cls - - -def _merge_mm_kwargs( - model_config: "ModelConfig", - processor_cls: Union[type, tuple[type, ...]], - /, - **kwargs, -): +def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs): mm_config = model_config.get_multimodal_config() - merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs) + base_kwargs = mm_config.mm_processor_kwargs + if base_kwargs is None: + base_kwargs = {} - factory = _get_processor_factory_fn(processor_cls) - allowed_kwargs = get_allowed_kwarg_only_overrides( - factory, - merged_kwargs, - requires_kw_only=False, - allow_var_kwargs=True, - ) + merged_kwargs = {**base_kwargs, **kwargs} # NOTE: Pythonic dict is not hashable and will raise unhashable type # error when calling `cached_get_processor`, therefore we need to # wrap it to a hashable dict. - for key, value in allowed_kwargs.items(): + for key, value in merged_kwargs.items(): if isinstance(value, dict): - allowed_kwargs[key] = HashableDict(value) + merged_kwargs[key] = HashableDict(value) if isinstance(value, list): - allowed_kwargs[key] = HashableList(value) - - return allowed_kwargs + merged_kwargs[key] = HashableList(value) + return merged_kwargs def get_processor( @@ -86,29 +61,21 @@ def get_processor( **kwargs: Any, ) -> _P: """Load a processor for the given model name via HuggingFace.""" - if revision is None: - revision = "main" + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoProcessor + + processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or + isinstance(processor_cls, tuple) else processor_cls) try: - if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin: - processor = AutoProcessor.from_pretrained( - processor_name, - *args, - revision=revision, - trust_remote_code=trust_remote_code, - **kwargs, - ) - elif issubclass(processor_cls, ProcessorMixin): - processor = processor_cls.from_pretrained( - processor_name, - *args, - revision=revision, - trust_remote_code=trust_remote_code, - **kwargs, - ) - else: - # Processors that are standalone classes unrelated to HF - processor = processor_cls(*args, **kwargs) + processor = processor_factory.from_pretrained( + processor_name, + *args, + revision=revision, + trust_remote_code=trust_remote_code, + **kwargs, + ) except ValueError as e: # If the error pertains to the processor class not existing or not # currently being imported, suggest using the --trust-remote-code flag. @@ -145,7 +112,7 @@ def cached_processor_from_config( revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, processor_cls=processor_cls, # type: ignore[arg-type] - **_merge_mm_kwargs(model_config, processor_cls, **kwargs), + **_merge_mm_kwargs(model_config, **kwargs), ) @@ -158,6 +125,10 @@ def get_feature_extractor( ): """Load an audio feature extractor for the given model name via HuggingFace.""" + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoFeatureExtractor + from transformers.feature_extraction_utils import FeatureExtractionMixin try: feature_extractor = AutoFeatureExtractor.from_pretrained( processor_name, @@ -193,7 +164,7 @@ def cached_feature_extractor_from_config( model_config.model, revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, - **_merge_mm_kwargs(model_config, AutoFeatureExtractor, **kwargs), + **_merge_mm_kwargs(model_config, **kwargs), ) @@ -205,6 +176,11 @@ def get_image_processor( **kwargs: Any, ): """Load an image processor for the given model name via HuggingFace.""" + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoImageProcessor + from transformers.image_processing_utils import BaseImageProcessor + try: processor = AutoImageProcessor.from_pretrained( processor_name, @@ -241,5 +217,5 @@ def cached_image_processor_from_config( model_config.model, revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, - **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs), + **_merge_mm_kwargs(model_config, **kwargs), ) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 0d3fa6b059beb..ae978c855a8e5 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -47,7 +47,7 @@ from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps from types import MappingProxyType from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, TextIO, Tuple, TypeVar, Union, cast, overload) + Optional, Tuple, TypeVar, Union, cast, overload) from urllib.parse import urlparse from uuid import uuid4 @@ -167,10 +167,6 @@ GB_bytes = 1_000_000_000 GiB_bytes = 1 << 30 """The number of bytes in one gibibyte (GiB).""" -# ANSI color codes -CYAN = '\033[1;36m' -RESET = '\033[0;0m' - STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.half, "bfloat16": torch.bfloat16, @@ -1668,12 +1664,6 @@ class FlexibleArgumentParser(ArgumentParser): # Enable the deprecated kwarg for Python 3.12 and below def parse_known_args(self, args=None, namespace=None): - if args is not None and "--disable-log-requests" in args: - # Special case warning because the warning below won't trigger - # if –-disable-log-requests because its value is default. - logger.warning_once( - "argument '--disable-log-requests' is deprecated. This " - "will be removed in v0.12.0.") namespace, args = super().parse_known_args(args, namespace) for action in FlexibleArgumentParser._deprecated: if (hasattr(namespace, dest := action.dest) @@ -2020,6 +2010,49 @@ def supports_kw( return False +def resolve_mm_processor_kwargs( + init_kwargs: Optional[Mapping[str, object]], + inference_kwargs: Optional[Mapping[str, object]], + callable: Callable[..., object], + *, + requires_kw_only: bool = True, + allow_var_kwargs: bool = False, +) -> dict[str, Any]: + """Applies filtering to eliminate invalid mm_processor_kwargs, i.e., + those who are not explicit keywords to the given callable (of one is + given; otherwise no filtering is done), then merges the kwarg dicts, + giving priority to inference_kwargs if there are any collisions. + + In the case that no kwarg overrides are provided, returns an empty + dict so that it can still be kwarg expanded into the callable later on. + + If allow_var_kwargs=True, allows for things that can be expanded into + kwargs as long as they aren't naming collision for var_kwargs or potential + positional arguments. + """ + # Filter inference time multimodal processor kwargs provided + runtime_mm_kwargs = get_allowed_kwarg_only_overrides( + callable, + overrides=inference_kwargs, + requires_kw_only=requires_kw_only, + allow_var_kwargs=allow_var_kwargs, + ) + + # Filter init time multimodal processor kwargs provided + init_mm_kwargs = get_allowed_kwarg_only_overrides( + callable, + overrides=init_kwargs, + requires_kw_only=requires_kw_only, + allow_var_kwargs=allow_var_kwargs, + ) + + # Merge the final processor kwargs, prioritizing inference + # time values over the initialization time values. + mm_processor_kwargs = {**init_mm_kwargs, **runtime_mm_kwargs} + + return mm_processor_kwargs + + def get_allowed_kwarg_only_overrides( callable: Callable[..., object], overrides: Optional[Mapping[str, object]], @@ -2794,9 +2827,6 @@ def make_zmq_socket( if linger is not None: socket.setsockopt(zmq.LINGER, linger) - if socket_type == zmq.XPUB: - socket.setsockopt(zmq.XPUB_VERBOSE, True) - # Determine if the path is a TCP socket with an IPv6 address. # Enable IPv6 on the zmq socket if so. scheme, host, _ = split_zmq_path(path) @@ -3271,52 +3301,3 @@ def set_process_title(name: str, else: name = f"{envs.VLLM_PROCESS_NAME_PREFIX}::{name}" setproctitle.setproctitle(name) - - -def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: - """Prepend each output line with process-specific prefix""" - - prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " - file_write = file.write - - def write_with_prefix(s: str): - if not s: - return - if file.start_new_line: # type: ignore[attr-defined] - file_write(prefix) - idx = 0 - while (next_idx := s.find('\n', idx)) != -1: - next_idx += 1 - file_write(s[idx:next_idx]) - if next_idx == len(s): - file.start_new_line = True # type: ignore[attr-defined] - return - file_write(prefix) - idx = next_idx - file_write(s[idx:]) - file.start_new_line = False # type: ignore[attr-defined] - - file.start_new_line = True # type: ignore[attr-defined] - file.write = write_with_prefix # type: ignore[method-assign] - - -def decorate_logs(process_name: Optional[str] = None) -> None: - """ - Adds a process-specific prefix to each line of output written to stdout and - stderr. - - This function is intended to be called before initializing the api_server, - engine_core, or worker classes, so that all subsequent output from the - process is prefixed with the process name and PID. This helps distinguish - log output from different processes in multi-process environments. - - Args: - process_name: Optional; the name of the process to use in the prefix. - If not provided, the current process name from the multiprocessing - context is used. - """ - if process_name is None: - process_name = get_mp_context().current_process().name - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 0edfb01cde9d6..169b083017e46 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -8,25 +8,13 @@ from __future__ import annotations import functools import importlib -import os from typing import Any, Callable, NoReturn import torch import vllm.envs as envs from vllm.platforms import current_platform -from vllm.utils import cdiv, has_deep_gemm - - -@functools.cache -def is_deep_gemm_supported() -> bool: - """Return ``True`` if DeepGEMM is supported on the current platform. - Currently, only Hopper and Blackwell GPUs are supported. - """ - is_supported_arch = current_platform.is_cuda() and ( - current_platform.is_device_capability(90) - or current_platform.is_device_capability(100)) - return has_deep_gemm() and is_supported_arch +from vllm.utils import has_deep_gemm @functools.cache @@ -38,7 +26,7 @@ def is_blackwell_deep_gemm_used() -> bool: return False _lazy_init() - if _fp8_gemm_nt_impl is None: + if _per_block_cast_impl is None: return False return (current_platform.is_cuda() @@ -64,26 +52,23 @@ def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None: _fp8_gemm_nt_impl: Callable[..., Any] | None = None _grouped_impl: Callable[..., Any] | None = None _grouped_masked_impl: Callable[..., Any] | None = None +_per_block_cast_impl: Callable[..., Any] | None = None def _lazy_init() -> None: """Import deep_gemm and resolve symbols on first use.""" - global _fp8_gemm_nt_impl, _grouped_impl, _grouped_masked_impl + global _fp8_gemm_nt_impl, _grouped_impl, _grouped_masked_impl, \ + _per_block_cast_impl # fast path if (_fp8_gemm_nt_impl is not None or _grouped_impl is not None - or _grouped_masked_impl is not None): + or _grouped_masked_impl is not None + or _per_block_cast_impl is not None): return if not has_deep_gemm(): return - # Set up deep_gemm cache path - DEEP_GEMM_JIT_CACHE_ENV_NAME = 'DG_JIT_CACHE_DIR' - if not os.environ.get(DEEP_GEMM_JIT_CACHE_ENV_NAME, None): - os.environ[DEEP_GEMM_JIT_CACHE_ENV_NAME] = os.path.join( - envs.VLLM_CACHE_ROOT, "deep_gemm") - _dg = importlib.import_module("deep_gemm") _fp8_gemm_nt_impl = _resolve_symbol(_dg, "fp8_gemm_nt", @@ -94,6 +79,14 @@ def _lazy_init() -> None: _grouped_masked_impl = _resolve_symbol( _dg, "fp8_m_grouped_gemm_nt_masked", "m_grouped_gemm_fp8_fp8_bf16_nt_masked") + # Try to get per_token_cast_to_fp8 from DeepGEMM math utils. + try: + _math_mod = importlib.import_module( + "deep_gemm.utils.math") # type: ignore + _per_block_cast_impl = getattr(_math_mod, "per_block_cast_to_fp8", + None) + except ModuleNotFoundError: + _per_block_cast_impl = None def fp8_gemm_nt(*args, **kwargs): @@ -117,37 +110,13 @@ def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): return _grouped_masked_impl(*args, **kwargs) -def _ceil_to_ue8m0(x: torch.Tensor): - return torch.pow(2.0, torch.ceil(torch.log2(x.abs()))) - - -def _align(x: int, y: int) -> int: - return cdiv(x, y) * y - - -DEFAULT_BLOCK_SIZE = [128, 128] - - -# Taken from https://github.com/deepseek-ai/DeepGEMM/blob/dd6ed14acbc7445dcef224248a77ab4d22b5f240/deep_gemm/utils/math.py#L38 -# TODO(wentao): optimize this function, using triton or cuda kernel -def per_block_cast_to_fp8( - x: torch.Tensor, - block_size: list[int] = DEFAULT_BLOCK_SIZE, - use_ue8m0: bool = False) -> tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - block_m, block_n = block_size - x_padded = torch.zeros((_align(m, block_m), _align(n, block_n)), - dtype=x.dtype, - device=x.device) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - sf = x_amax / 448.0 - sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf - x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view( - x_view.size(0), x_view.size(2)) +def per_block_cast_to_fp8(x, *args, **kwargs): + _lazy_init() + if _per_block_cast_impl is not None and is_blackwell_deep_gemm_used(): + return _per_block_cast_impl(x, use_ue8m0=True) + # TODO: refactor the `per_block_cast_to_fp8` from tests to vllm utils + from tests.kernels.quant_utils import per_block_cast_to_fp8 as _pbcf + return _pbcf(x, *args, **kwargs) def calc_diff(x: torch.Tensor, y: torch.Tensor): @@ -173,5 +142,4 @@ __all__ = [ "fp8_m_grouped_gemm_nt_masked", "per_block_cast_to_fp8", "is_blackwell_deep_gemm_used", - "is_deep_gemm_supported", ] diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 29967bc516715..3bfb9808c0a00 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -10,25 +10,12 @@ import contextlib import functools import importlib import importlib.util -import os -from typing import Any, Callable, NoReturn, Optional +from typing import Any, Callable, NoReturn -import requests - -import vllm.envs as envs from vllm.logger import init_logger -from vllm.platforms import current_platform logger = init_logger(__name__) -# This is the storage path for the cubins, it can be replaced -# with a local path for testing. -# Referenced from https://github.com/flashinfer-ai/flashinfer/blob/0c9a92c3d9a7e043ab6f3f7b2273269caf6ab044/flashinfer/jit/cubin_loader.py#L35 # noqa: E501 -FLASHINFER_CUBINS_REPOSITORY = os.environ.get( - "FLASHINFER_CUBINS_REPOSITORY", - "https://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/", # noqa: E501 -) - @functools.cache def has_flashinfer() -> bool: @@ -121,70 +108,6 @@ def has_flashinfer_cutlass_fused_moe() -> bool: return True -@functools.cache -def has_nvidia_artifactory() -> bool: - """Return ``True`` if NVIDIA's artifactory is accessible. - - This checks connectivity to the kernel inference library artifactory - which is required for downloading certain cubin kernels like TRTLLM FHMA. - """ - try: - # Use a short timeout to avoid blocking for too long - response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5) - accessible = response.status_code == 200 - if accessible: - logger.debug_once("NVIDIA artifactory is accessible") - else: - logger.warning_once( - "NVIDIA artifactory returned failed status code: %d", - response.status_code) - return accessible - except Exception as e: - logger.warning_once("Failed to connect to NVIDIA artifactory: %s", e) - return False - - -def use_trtllm_decode_attention( - num_tokens: int, - max_seq_len: int, - kv_cache_dtype: str, - num_qo_heads: Optional[int], - num_kv_heads: Optional[int], - attn_head_size: Optional[int], -) -> bool: - # Requires SM100 and NVIDIA artifactory to be accessible to download cubins - if not (current_platform.is_device_capability(100) - and has_nvidia_artifactory()): - return False - - # Check if the dimensions are supported by TRTLLM decode attention - if (attn_head_size is None or num_qo_heads is None or num_kv_heads is None - or num_qo_heads // num_kv_heads > 8 - or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): - return False - - env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION - if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", - env_value) - # Environment variable is set - respect it - # Making the conditional check for zero because - # the path is automatically enabled if the batch size condition - # is satisfied. - no_use_trtllm = (env_value == "0") - if not no_use_trtllm: - logger.info_once("Using TRTLLM decode attention.") - return not no_use_trtllm - else: - # Environment variable not set - use auto-detection - use_trtllm = (num_tokens <= 256 and max_seq_len < 131072 - and kv_cache_dtype == "auto") - if use_trtllm: - logger.warning_once( - "Using TRTLLM decode attention (auto-detected).") - return use_trtllm - - __all__ = [ "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", @@ -194,6 +117,4 @@ __all__ = [ "autotune", "has_flashinfer_moe", "has_flashinfer_cutlass_fused_moe", - "has_nvidia_artifactory", - "use_trtllm_decode_attention", ] diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index f086bab2556eb..4c2a6c6b985b2 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -25,8 +25,7 @@ if is_flash_attn_varlen_func_available(): from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.utils import cdiv -from vllm.v1.attention.backends.utils import (AttentionCGSupport, - AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout) from vllm.v1.kv_cache_interface import AttentionSpec @@ -100,13 +99,6 @@ class FlashAttentionBackend(AttentionBackend): raise ValueError(f"Unknown cache layout format {cache_layout}.") return stride_order - @staticmethod - def get_fp8_dtype_for_flashattn(kv_cache_dtype: str) -> torch.dtype: - if kv_cache_dtype in ("fp8", "fp8_e4m3"): - return torch.float8_e4m3fn - else: - raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") - @dataclass class FlashAttentionMetadata: @@ -154,9 +146,7 @@ def _get_sliding_window_configs( class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.NEVER if get_flash_attn_version() == 2 \ - else AttentionCGSupport.ALWAYS + full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3 def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): @@ -171,7 +161,6 @@ class FlashAttentionMetadataBuilder( self.parallel_config) self.num_heads_kv = self.model_config.get_num_kv_heads( self.parallel_config) - self.kv_cache_dtype = kv_cache_spec.dtype self.headdim = self.model_config.get_head_size() self.block_size = kv_cache_spec.block_size @@ -250,24 +239,17 @@ class FlashAttentionMetadataBuilder( def schedule(batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): - cache_dtype = self.cache_config.cache_dtype - if cache_dtype.startswith("fp8"): - qkv_dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn( - cache_dtype) - else: - qkv_dtype = self.kv_cache_dtype if aot_schedule: return get_scheduler_metadata( batch_size=batch_size, max_seqlen_q=max_query_len, max_seqlen_k=max_seq_len, + cache_seqlens=seqlens, num_heads_q=self.num_heads_q, num_heads_kv=self.num_heads_kv, headdim=self.headdim, - cache_seqlens=seqlens, - qkv_dtype=qkv_dtype, - cu_seqlens_q=cu_query_lens, page_size=self.block_size, + cu_seqlens_q=cu_query_lens, causal=causal, window_size=self.aot_sliding_window, num_splits=self.max_num_splits, @@ -492,10 +474,8 @@ class FlashAttentionImpl(AttentionImpl): ) if self.kv_cache_dtype.startswith("fp8"): - dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn( - self.kv_cache_dtype) - key_cache = key_cache.view(dtype) - value_cache = value_cache.view(dtype) + key_cache = key_cache.view(torch.float8_e4m3fn) + value_cache = value_cache.view(torch.float8_e4m3fn) num_tokens, num_heads, head_size = query.shape query, _ = ops.scaled_fp8_quant( query.reshape( diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 0aaad02b5b840..27552f0e7c1ef 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -4,28 +4,26 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Optional, Union +from typing import TYPE_CHECKING, Optional import torch from flashinfer import (BatchDecodeWithPagedKVCacheWrapper, BatchPrefillWithPagedKVCacheWrapper, MultiLevelCascadeAttentionWrapper) -from flashinfer.decode import (_get_range_buf, get_seq_lens, - trtllm_batch_decode_with_kv_cache) +from flashinfer.decode import trtllm_batch_decode_with_kv_cache import vllm.envs as envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionType) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import cdiv, is_pin_memory_available -from vllm.utils.flashinfer import use_trtllm_decode_attention +from vllm.platforms import current_platform +from vllm.utils import cdiv from vllm.v1.attention.backends.flash_attn import use_cascade_attention from vllm.v1.attention.backends.utils import ( - AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, - get_kv_cache_layout, get_per_layer_parameters, - infer_global_hyperparameters, reorder_batch_to_split_decodes_and_prefills, - split_decodes_and_prefills) + AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout, + get_per_layer_parameters, infer_global_hyperparameters, + reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec if TYPE_CHECKING: @@ -40,6 +38,7 @@ logger = init_logger(__name__) class FlashInferBackend(AttentionBackend): accept_output_buffer: bool = True + cached_sm100a_supported: Optional[bool] = None @classmethod def get_supported_dtypes(cls) -> list[torch.dtype]: @@ -99,6 +98,48 @@ class FlashInferBackend(AttentionBackend): raise ValueError(f"Unknown cache layout format {cache_layout}.") return stride_order + @staticmethod + def use_trtllm_decode_attention( + batch_size: int, + max_seq_len: int, + kv_cache_dtype: str, + num_qo_heads: int, + num_kv_heads: int, + attn_head_size: int, + ) -> bool: + if FlashInferBackend.cached_sm100a_supported is None: + FlashInferBackend.cached_sm100a_supported = ( + current_platform.has_device_capability(100)) + if not FlashInferBackend.cached_sm100a_supported: + return False + if (num_qo_heads // num_kv_heads > 8 + or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): + return False + env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION + if env_value is not None: + logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", + env_value) + # Environment variable is set - respect it + # Making the conditional check for zero because + # the path is automatically enabled if the batch size condition + # is satisfied. + no_use_trtllm = env_value == "0" + if not no_use_trtllm: + logger.info_once( + "VLLM_USE_TRTLLM_DECODE_ATTENTION is set to 1, " + "using TRTLLM decode attention.") + return not no_use_trtllm + else: + # Environment variable not set - use auto-detection + # Only supports attention head size of 128 + use_trtllm = (FlashInferBackend.cached_sm100a_supported + and batch_size <= 256 and max_seq_len < 131072 + and kv_cache_dtype == "auto") + if use_trtllm: + logger.warning_once( + "Using TRTLLM decode attention (auto-detected).") + return use_trtllm + @staticmethod def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: if kv_cache_dtype in ("fp8", "fp8_e4m3"): @@ -176,66 +217,26 @@ class FlashInferMetadata: class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.PURE_DECODE_ONLY def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): self.device = device - self.vllm_config = vllm_config - self.cache_config = vllm_config.cache_config - self.kv_cache_spec = kv_cache_spec self._workspace_buffer = None self._prefill_wrapper = None # Wrapper for prefill/append - self._decode_wrapper = None # Wrapper for decode (general shape) - - self.compilation_config = vllm_config.compilation_config - max_num_pages_per_req = cdiv(vllm_config.model_config.max_model_len, - self.kv_cache_spec.block_size) - max_num_reqs = vllm_config.scheduler_config.max_num_seqs - max_num_pages = max_num_reqs * max_num_pages_per_req - self.enable_cuda_graph = self.compilation_config.full_cuda_graph - if self.enable_cuda_graph: - # For full cudagraph capture, one `decode_wrapper` for each batch - # size is needed for FlashInfer. - self._decode_wrappers_cudagraph: dict[ - int, BatchDecodeWithPagedKVCacheWrapper] = {} - self._decode_cudagraph_max_bs = min( - max_num_reqs, self.compilation_config.max_capture_size) - + self._decode_wrapper = None # Wrapper for decode self._cascade_wrapper = None # Wrapper for cascade attention # Global hyperparameters shared by all attention layers self.global_hyperparameters = infer_global_hyperparameters( get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)) - # Preparing persistent buffers (device-side) - self.paged_kv_indptr = torch.zeros(max_num_reqs + 1, - dtype=torch.int32, - device=self.device) - self.paged_kv_indices = torch.zeros( - max_num_pages, # max num pages possible - dtype=torch.int32, - device=self.device) - self.paged_kv_last_page_len = torch.zeros(max_num_reqs, - dtype=torch.int32, - device=self.device) - # host-side buffer - pin_memory = is_pin_memory_available() - self.paged_kv_indptr_cpu = torch.zeros(max_num_reqs + 1, - dtype=torch.int32, - device="cpu", - pin_memory=pin_memory) - self.paged_kv_indices_cpu = torch.zeros(max_num_pages, - dtype=torch.int32, - device="cpu", - pin_memory=pin_memory) - self.paged_kv_last_page_len_cpu = torch.zeros(max_num_reqs, - dtype=torch.int32, - device="cpu", - pin_memory=pin_memory) - - self.block_table_arange = torch.arange(max_num_pages_per_req, + self.vllm_config = vllm_config + self.cache_config = vllm_config.cache_config + self.kv_cache_spec = kv_cache_spec + max_num_blocks_per_request = cdiv( + vllm_config.model_config.max_model_len, + self.kv_cache_spec.block_size) + self.block_table_arange = torch.arange(max_num_blocks_per_request, dtype=torch.int32, device=self.device) @@ -259,16 +260,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self._get_workspace_buffer(), get_kv_cache_layout()) return self._prefill_wrapper - def _get_decode_wrapper(self, - batch_size: int, - use_cudagraph: bool = False): - if use_cudagraph: - decode_wrapper = self._decode_wrappers_cudagraph.get( - batch_size, None) - else: - decode_wrapper = self._decode_wrapper - - if decode_wrapper is None: + def _get_decode_wrapper(self): + if self._decode_wrapper is None: num_qo_heads = ( self.vllm_config.model_config.get_num_attention_heads( self.vllm_config.parallel_config)) @@ -276,32 +269,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.vllm_config.parallel_config) use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or ( num_qo_heads // num_kv_heads > 4) - - if use_cudagraph: - paged_kv_indptr = self.paged_kv_indptr[:batch_size + 1] - paged_kv_indices = self.paged_kv_indices - paged_kv_last_page_len = self.paged_kv_last_page_len[: - batch_size] - else: - paged_kv_indptr = None - paged_kv_indices = None - paged_kv_last_page_len = None - decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( + self._decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( self._get_workspace_buffer(), get_kv_cache_layout(), - use_cuda_graph=use_cudagraph, - paged_kv_indptr_buffer=paged_kv_indptr, - paged_kv_indices_buffer=paged_kv_indices, - paged_kv_last_page_len_buffer=paged_kv_last_page_len, use_tensor_cores=use_tensor_cores) - - # save the decode wrapper - if use_cudagraph: - self._decode_wrappers_cudagraph[batch_size] = decode_wrapper - else: - self._decode_wrapper = decode_wrapper - - return decode_wrapper + return self._decode_wrapper def _get_cascade_wrapper(self): if self._cascade_wrapper is None: @@ -379,44 +351,16 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) if num_decodes > 0: - pure_decode = num_prefills == 0 - # possible required padding for cudagraph replay - use_cudagraph = (self.enable_cuda_graph and pure_decode and - num_decodes <= self._decode_cudagraph_max_bs) - if use_cudagraph: - num_input_tokens = ( - self.vllm_config.pad_for_cudagraph(num_decodes)) - # Carefully fulfill the padding region with reasonable value - # on cpu. - # Make sure paged_kv_indptr_cpu is not decreasing - self.paged_kv_indptr_cpu[1 + num_decodes:1 + - num_input_tokens].fill_( - attn_metadata. - paged_kv_indptr_cpu[-1]) - # Fill the remaining paged_kv_last_page_len_cpu with 1. - # This is because flashinfer treats 0 as a full page - # instead of empty. - self.paged_kv_last_page_len_cpu[ - num_decodes:num_input_tokens].fill_(1) - - else: - num_input_tokens = num_decodes - - attn_metadata.decode_wrapper = self._get_decode_wrapper( - num_input_tokens, use_cudagraph) - if not use_trtllm_decode_attention( + attn_metadata.decode_wrapper = self._get_decode_wrapper() + if not FlashInferBackend.use_trtllm_decode_attention( num_decodes, attn_metadata.max_seq_len, self.cache_config.cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): - # Use the persistent buffer with padding length, - # instead of the same address but chunked version - # in atten_metadata when using cudagraph. - fast_plan_decode( - attn_metadata.decode_wrapper, - self.paged_kv_indptr_cpu[:num_input_tokens + 1], + attn_metadata.decode_wrapper.plan( + attn_metadata.paged_kv_indptr_cpu[:num_decodes + 1], attn_metadata.paged_kv_indices, - self.paged_kv_last_page_len_cpu[:num_input_tokens], + attn_metadata.paged_kv_last_page_len_cpu[:num_decodes], attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim, @@ -435,7 +379,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, fast_build: bool = False) -> FlashInferMetadata: - num_reqs = common_attn_metadata.num_reqs num_actual_tokens = common_attn_metadata.num_actual_tokens num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens =\ split_decodes_and_prefills(common_attn_metadata) @@ -481,26 +424,18 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): non_blocking=True) mask = (self.block_table_arange[:max_num_blocks].unsqueeze(0) < block_table_bounds.unsqueeze(1)) - # write self.paged_kv_indices inplace - num_actual_pages = torch.sum(mask) - paged_kv_indices = self.paged_kv_indices[:num_actual_pages] - torch.masked_select(block_table_tensor[:, :max_num_blocks], - mask, - out=paged_kv_indices) + paged_kv_indices = block_table_tensor[:, :max_num_blocks][mask] - # write self.paged_kv_indptr_cpu inplace (0-index is always 0) - torch.cumsum(block_table_bounds_cpu, - dim=0, - dtype=torch.int32, - out=self.paged_kv_indptr_cpu[1:1 + num_reqs]) + paged_kv_indptr_cpu = torch.zeros(len(block_table_bounds_cpu) + 1, + dtype=torch.int32, + device='cpu') + paged_kv_indptr_cpu[1:] = block_table_bounds_cpu.cumsum( + dim=0, dtype=torch.int32) paged_kv_last_page_len_cpu = seq_lens_cpu % page_size - # write self.paged_kv_last_page_len_cpu inplace - torch.where(paged_kv_last_page_len_cpu == 0, - torch.tensor(page_size), - paged_kv_last_page_len_cpu, - out=self.paged_kv_last_page_len_cpu[:num_reqs]) - + paged_kv_last_page_len_cpu = torch.where( + paged_kv_last_page_len_cpu == 0, page_size, + paged_kv_last_page_len_cpu) cache_dtype = self.cache_config.cache_dtype if cache_dtype.startswith("fp8"): kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( @@ -510,10 +445,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu, - paged_kv_indptr_cpu=self.paged_kv_indptr_cpu[:1 + num_reqs], + paged_kv_indptr_cpu=paged_kv_indptr_cpu, paged_kv_indices=paged_kv_indices, - paged_kv_last_page_len_cpu=self. - paged_kv_last_page_len_cpu[:num_reqs], + paged_kv_last_page_len_cpu=paged_kv_last_page_len_cpu, num_qo_heads=self.vllm_config.model_config.get_num_attention_heads( self.vllm_config.parallel_config), num_kv_heads=self.kv_cache_spec.num_kv_heads, @@ -540,26 +474,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): return attn_metadata - def build_for_cudagraph_capture( - self, common_attn_metadata: CommonAttentionMetadata): - """ - This method builds the metadata for full cudagraph capture. - Currently, only decode is supported for full cudagraphs with FlashInfer. - """ - m = common_attn_metadata - - assert m.num_reqs == m.num_actual_tokens, \ - "FlashInfer only supports decode-only full CUDAGraph capture. " \ - "Make sure all cudagraph capture sizes <= max_num_seq." - - m.max_query_len = 1 # decode-only - - return self.build(0, m) - - def can_run_in_cudagraph( - self, common_attn_metadata: CommonAttentionMetadata) -> bool: - return common_attn_metadata.max_query_len == 1 - def use_cascade_attention(self, *args, **kwargs) -> bool: if self.kv_cache_spec.dtype != self.vllm_config.model_config.dtype: # TODO: The cascade wrapper currently does not support setting @@ -722,7 +636,7 @@ class FlashInferImpl(AttentionImpl): decode_query = query[:num_decode_tokens] assert decode_query.shape[0] == num_decode_tokens assert decode_wrapper is not None - if not use_trtllm_decode_attention( + if not FlashInferBackend.use_trtllm_decode_attention( attn_metadata.num_decodes, attn_metadata.max_seq_len, self.kv_cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): @@ -767,163 +681,3 @@ class FlashInferImpl(AttentionImpl): out=output[:num_decode_tokens], ) return output_padded - - -def fast_plan_decode( - self, # decode wrapper - indptr_cpu: torch.Tensor, - indices: torch.Tensor, - last_page_len_cpu: torch.Tensor, - num_qo_heads: int, - num_kv_heads: int, - head_dim: int, - page_size: int, - pos_encoding_mode: str = "NONE", - window_left: int = -1, - logits_soft_cap: Optional[float] = None, - q_data_type: Optional[Union[str, torch.dtype]] = "float16", - kv_data_type: Optional[Union[str, torch.dtype]] = None, - data_type: Optional[Union[str, torch.dtype]] = None, - sm_scale: Optional[float] = None, - rope_scale: Optional[float] = None, - rope_theta: Optional[float] = None, - non_blocking: bool = True, -) -> None: - """ - A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for - cudagraph capture/replay, while the no cudagraph version turns back - to the original plan. - using original plan after passing host-side buffers: - - only host-to-device copy of indptr and last_page_len buffers - Modifications for cudagraph: - - only host-to-device copy of indptr and last_page_len buffers. - - avoid device-to-device copy of indices buffer. - - Part of the code get inspiration from the original plan from FlashInfer repo - and the implementation of fast_decode_plan for FlashInfer in SGlang repo. - """ - # Warm up with the original plan if it is first call, and always run the - # original plan if we run for dynamic shape. For fixed shape (cudagraph), - # this warm up is to generate the _cached_module for the decode wrapper. - if not self.is_cuda_graph_enabled or \ - getattr(self, "vllm_first_call", True): - self.plan( - indptr_cpu, - indices, - last_page_len_cpu, - num_qo_heads, - num_kv_heads, - head_dim, - page_size, - pos_encoding_mode, - window_left, - logits_soft_cap, - q_data_type, - kv_data_type, - data_type, - sm_scale, - rope_scale, - rope_theta, - non_blocking, - ) - self.vllm_first_call = False - return - - assert self.is_cuda_graph_enabled, "Should be cudagraph only here" - - batch_size = len(last_page_len_cpu) - if logits_soft_cap is None: - logits_soft_cap = 0.0 - - # Handle data types consistently - if data_type is not None: - if q_data_type is None: - q_data_type = data_type - if kv_data_type is None: - kv_data_type = data_type - elif q_data_type is None: - q_data_type = "float16" - - if kv_data_type is None: - kv_data_type = q_data_type - q_data_type = getattr(torch, q_data_type) if isinstance( - q_data_type, str) else q_data_type - kv_data_type = getattr(torch, kv_data_type) if isinstance( - kv_data_type, str) else kv_data_type - - if self.use_tensor_cores: - qo_indptr_host = _get_range_buf(batch_size + 1, "cpu") - - if batch_size != self._fixed_batch_size: - raise ValueError( - "The batch size should be fixed in cudagraph mode, the runtime " - "batch size {} mismatches the batch size set during " - "initialization {}".format(batch_size, self._fixed_batch_size)) - if len(indices) > len(self._paged_kv_indices_buf): - raise ValueError( - "The size of indices should be less than or equal to the " - "allocated buffer") - - # host-to-device copy for the indptr buffer - self._paged_kv_indptr_buf.copy_(indptr_cpu, non_blocking=True) - # host-to-device copy for the last_page_len buffer - self._paged_kv_last_page_len_buf.copy_(last_page_len_cpu, - non_blocking=True) - - indptr_host = indptr_cpu - last_page_len_host = last_page_len_cpu - - if self.use_tensor_cores: - kv_lens_arr_host = get_seq_lens(indptr_host, last_page_len_host, - page_size) - - try: - # Make sure we pass exactly 15 arguments for tensor core version - self._plan_info = self._cached_module.plan( - self._float_workspace_buffer, - self._int_workspace_buffer, - self._pin_memory_int_workspace_buffer, - qo_indptr_host, - indptr_host, - kv_lens_arr_host, - batch_size, # total_num_rows - batch_size, - num_qo_heads, - num_kv_heads, - page_size, - self.is_cuda_graph_enabled, - head_dim, - head_dim, - False, # causal - ) - except Exception as e: - raise RuntimeError(f"Error in tensor core plan: {e}") from e - else: - try: - # Make sure we pass exactly 15 arguments for standard version - self._plan_info = self._cached_module.plan( - self._float_workspace_buffer, - self._int_workspace_buffer, - self._pin_memory_int_workspace_buffer, - indptr_host, - batch_size, - num_qo_heads, - num_kv_heads, - page_size, - self.is_cuda_graph_enabled, - window_left, - logits_soft_cap, - head_dim, - head_dim, - torch.empty(0, dtype=q_data_type), - torch.empty(0, dtype=kv_data_type), - ) - except Exception as e: - raise RuntimeError(f"Error in standard plan: {e}") from e - - self._pos_encoding_mode = pos_encoding_mode - self._window_left = window_left - self._logits_soft_cap = logits_soft_cap - self._sm_scale = sm_scale - self._rope_scale = rope_scale - self._rope_theta = rope_theta diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index ebd9565f3f760..0b066de449371 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -209,7 +209,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, UnquantizedLinearMethod) from vllm.platforms import current_platform from vllm.utils import cdiv, round_down -from vllm.utils.flashinfer import has_nvidia_artifactory from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, get_per_layer_parameters, infer_global_hyperparameters, @@ -380,16 +379,17 @@ M = TypeVar("M", bound=MLACommonMetadata) def use_flashinfer_prefill() -> bool: - # For blackwell default to flashinfer prefill if its available since - # it is faster than FA2. - return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL - and current_platform.is_device_capability(100)) + if flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL: + # For blackwell default to flashinfer prefill if its available since + # its faster than FA2. + return current_platform.has_device_capability(100) + return False def use_cudnn_prefill() -> bool: - return (flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL - and current_platform.is_device_capability(100) - and has_nvidia_artifactory()) + if flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL: + return current_platform.has_device_capability(100) + return False # Currently 394MB, this can be tuned based on GEMM sizes used. @@ -444,8 +444,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): # 2*(192*128)*(64*1024) = 3gb # (assuming 192 QK head dim, 128 heads, and fp16) 128 * 1024) - self.chunked_prefill_workspace_size = \ - scheduler_config.max_num_seqs * cache_config.block_size + self.chunked_prefill_workspace_size = scheduler_config.max_num_seqs * cache_config.block_size assert self.chunked_prefill_workspace_size >= \ scheduler_config.max_num_seqs * cache_config.block_size self.chunked_prefill_workspace = torch.empty( @@ -564,8 +563,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): def reorder_batch(self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput") -> bool: return reorder_batch_to_split_decodes_and_prefills(input_batch, - scheduler_output, - decode_threshold=1) + scheduler_output, + decode_threshold=1) def _build_decode(self, block_table_tensor: torch.Tensor, seq_lens: torch.Tensor): diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index b078c183a251d..290dd252a1578 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -18,7 +18,6 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder) -from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) @@ -55,8 +54,7 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]): class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.PURE_DECODE_ONLY + full_cudagraph_supported: ClassVar[bool] = True # Decode-only def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): @@ -81,7 +79,7 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): ) if self.compilation_config.full_cuda_graph: - # if False: + # if False: n = num_splits.size(0) # First time around (CUDAGraph capture), allocate the static buffer if self.cg_buf_num_splits is None: @@ -100,8 +98,7 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): # Num splits is per-batch, varying size (batch_size,) n = num_splits.size(0) - # logger.info( - # f"N: {n} num splits {self.cg_buf_num_splits.size(0)}") + # logger.info(f"N: {n} num splits {self.cg_buf_num_splits.size(0)}") # make sure static buffer is large enough assert n <= self.cg_buf_num_splits.size(0) num_splits_view = self.cg_buf_num_splits[:n] diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 8b55e1a301992..5c5891f035ae2 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -17,7 +17,6 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder) -from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec # yapf: enable @@ -65,8 +64,7 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.PURE_DECODE_ONLY + full_cudagraph_supported: ClassVar[bool] = True # decode only def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 942cb95eefa2f..195fbd3b1b9c4 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -18,8 +18,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.v1.attention.backends.utils import (AttentionCGSupport, - AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec @@ -58,8 +57,7 @@ class TritonAttentionMetadata: class TritonAttentionMetadataBuilder( AttentionMetadataBuilder[TritonAttentionMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.ALWAYS + full_cudagraph_supported: ClassVar[bool] = True def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 68c51340a2ac4..93f45c8b65093 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import abc -import enum import functools from abc import abstractmethod from dataclasses import dataclass, make_dataclass @@ -148,24 +147,9 @@ def split_attn_metadata( M = TypeVar("M") -class AttentionCGSupport(enum.Enum): - """ Constants for the cudagraph support of the attention backend - Here we do not consider the cascade attention, as currently - it is never cudagraph supported.""" - - NEVER = 0 - """NO cudagraph support""" - PURE_DECODE_ONLY = 1 - """Cudagraph supported for pure decode, need to run without - cudagraph for mixed prefill-decode batches""" - ALWAYS = 2 - """Cudagraph always supported""" - - class AttentionMetadataBuilder(abc.ABC, Generic[M]): # Does this backend/builder support CUDA Graphs for attention. - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.NEVER + full_cudagraph_supported: ClassVar[bool] = False @abstractmethod def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index eab1560b1a18c..25520eb655111 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -567,10 +567,12 @@ def hash_request_tokens(hash_function: Any, block_size: int, ret = [] parent_block_hash_value = None - # Only full blocks will be hashed - for start in range(0, len(token_ids) - block_size + 1, block_size): + for start in range(0, len(token_ids), block_size): end = start + block_size block_token_ids = token_ids[start:end] + # Do not hash the block if it is not full. + if len(block_token_ids) < block_size: + break if req_need_extra_keys: # MM and LoRA requests need extra keys for block-hash computation. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 45f450291ab63..ed0d9620f4762 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -27,7 +27,7 @@ from vllm.transformers_utils.config import ( from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, cdiv, deprecate_kwargs +from vllm.utils import Device, cdiv from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError @@ -57,7 +57,6 @@ class AsyncLLM(EngineClient): start_engine_loop: bool = True, stat_loggers: Optional[list[StatLoggerFactory]] = None, client_addresses: Optional[dict[str, str]] = None, - client_count: int = 1, client_index: int = 0, ) -> None: """ @@ -121,7 +120,6 @@ class AsyncLLM(EngineClient): executor_class=executor_class, log_stats=self.log_stats, client_addresses=client_addresses, - client_count=client_count, client_index=client_index, ) @@ -144,23 +142,16 @@ class AsyncLLM(EngineClient): pass @classmethod - @deprecate_kwargs( - "disable_log_requests", - additional_message=("This argument will have no effect. " - "Use `enable_log_requests` instead."), - ) def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[list[StatLoggerFactory]] = None, - enable_log_requests: bool = False, - disable_log_stats: bool = False, - client_addresses: Optional[dict[str, str]] = None, - client_count: int = 1, - client_index: int = 0, - disable_log_requests: bool = True, # Deprecated, will be removed + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[list[StatLoggerFactory]] = None, + disable_log_requests: bool = False, + disable_log_stats: bool = False, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0, ) -> "AsyncLLM": if not envs.VLLM_USE_V1: raise ValueError( @@ -175,11 +166,10 @@ class AsyncLLM(EngineClient): executor_class=Executor.get_class(vllm_config), start_engine_loop=start_engine_loop, stat_loggers=stat_loggers, - log_requests=enable_log_requests, + log_requests=not disable_log_requests, log_stats=not disable_log_stats, usage_context=usage_context, client_addresses=client_addresses, - client_count=client_count, client_index=client_index, ) @@ -201,7 +191,7 @@ class AsyncLLM(EngineClient): return cls( vllm_config=vllm_config, executor_class=executor_class, - log_requests=engine_args.enable_log_requests, + log_requests=not engine_args.disable_log_requests, log_stats=not engine_args.disable_log_stats, start_engine_loop=start_engine_loop, usage_context=usage_context, diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index 596edfdbe24f8..440628576bcb7 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import copy import multiprocessing import time import weakref @@ -66,14 +65,18 @@ class DPCoordinator: # Assume coordinator is colocated with front-end procs when not in # either external or hybrid DP LB mode. - local_only = not (external_lb or hybrid_lb) front_publish_address = get_engine_client_zmq_addr( - local_only=local_only, host=host) + local_only=not external_lb and not hybrid_lb, host=host) local_only_eng = dp_size == parallel_config.data_parallel_size_local back_publish_address = get_engine_client_zmq_addr(local_only_eng, host) back_output_address = get_engine_client_zmq_addr(local_only_eng, host) + # When in external LB mode, load stats aren't published, only changes + # to request wave / running state, so we don't need to rate-limit the + # updates to the front-end proc(s). + min_stats_update_interval_ms = 0 if external_lb else 100 + context = get_mp_context() self.proc: multiprocessing.Process = context.Process( target=DPCoordinatorProc.run_coordinator, @@ -83,6 +86,7 @@ class DPCoordinator: "front_publish_address": front_publish_address, "back_output_address": back_output_address, "back_publish_address": back_publish_address, + "min_stats_update_interval_ms": min_stats_update_interval_ms, }, daemon=True) self.proc.start() @@ -121,6 +125,10 @@ class DPCoordinatorProc: self.stats_update_interval_ms = min_stats_update_interval_ms + self.current_wave = 0 + self.engines_running = False + self.stats_changed = False + @staticmethod def run_coordinator( engine_count: int, @@ -147,16 +155,6 @@ class DPCoordinatorProc: decoder = MsgpackDecoder(EngineCoreOutputs) - # For tracking request wave progression. - current_wave = 0 - engines_running = False - - # For tracking request counts for internal load-balancing. - stats_changed = False - last_stats_step = -1 - last_stats_wave = -1 - last_step_counts: Optional[list[list[int]]] = None - with make_zmq_socket( path=front_publish_address, # IPC ctx=self.ctx, @@ -174,18 +172,6 @@ class DPCoordinatorProc: bind=True, ) as publish_back: - # Wait until all engines subscribe. - for _ in self.engines: - if publish_back.recv() != b'\x01': - logger.error( - "DP Coordinator received unexpected message while " - "waiting for engines to subscribe") - return - # Send ready message to engines. - publish_back.send(b"READY") - - logger.info("All engine subscriptions received by DP coordinator") - poller = zmq.Poller() poller.register(publish_front, zmq.POLLIN) poller.register(output_back, zmq.POLLIN) @@ -193,33 +179,21 @@ class DPCoordinatorProc: while True: elapsed = int(time.time() * 1000) - last_publish_time # Send at stats_update_interval_ms interval if the stats have - # changed, or otherwise every 5 seconds. + # changed, or otherwise every 4 seconds. wait_for = (self.stats_update_interval_ms - if stats_changed else 5000) - - # Wait at least 50ms to ensure we've received all stats for - # the current step. - min_timeout = 50 if last_step_counts is None else 0 - - events = poller.poll(timeout=max(min_timeout, wait_for - - elapsed)) + if self.stats_changed else 4000) + events = poller.poll(timeout=max(0, wait_for - elapsed)) if not events: # Poller timeout - publish current stats to front-ends. - if last_step_counts is not None: - engine_req_counts_list = last_step_counts - last_step_counts = None - else: - engine_req_counts_list = self._get_engine_counts() - stats_changed = False - - to_publish = (engine_req_counts_list, current_wave, - engines_running) + engine_req_counts_list = self._get_engine_counts() + to_publish = (engine_req_counts_list, self.current_wave, + self.engines_running) publish_front.send(msgspec.msgpack.encode(to_publish)) last_publish_time = int(time.time() * 1000) + self.stats_changed = False continue events = dict(events) - wave_state_changed = False if publish_front in events: buffer = publish_front.recv() @@ -246,7 +220,7 @@ class DPCoordinatorProc: # current_wave # we note that 0 is the wave number for the new # engine - engines_running = False + self.engines_running = False logger.info( "DPCoordinator scaled up from %s to %s " "engines", current_count, new_engine_count) @@ -262,15 +236,15 @@ class DPCoordinatorProc: # engines are paused, so that we can wake the other # engines. engine_to_exclude, wave = decoded - if not engines_running: - if wave < current_wave: + if not self.engines_running: + if wave < self.current_wave: # If the wave number is stale, ensure the message # is handled by all the engines. engine_to_exclude = None - engines_running = True - wave_state_changed = True - self._send_start_wave(publish_back, current_wave, + self.engines_running = True + self.stats_changed = True + self._send_start_wave(publish_back, self.current_wave, engine_to_exclude) if output_back in events: @@ -288,56 +262,36 @@ class DPCoordinatorProc: # 1. Updated request load stats - update our local # state with these. stats = self.engines[eng_index].request_counts - stats_step = scheduler_stats.step_counter - stats_wave = scheduler_stats.current_wave - if (stats_wave > last_stats_wave - or stats_wave == last_stats_wave - and stats_step > last_stats_step): - if stats_changed: - last_step_counts = self._get_engine_counts( - do_copy=True) - last_stats_step = stats_step - last_stats_wave = stats_wave - elif stats_wave != last_stats_wave or ( - stats_step != last_stats_step): - logger.warning( - "Received stats for out-of-order " - "step (%d, %d) from engine %d (expected " - "> (%d, %d))", stats_wave, stats_step, - eng_index, last_stats_wave, last_stats_step) stats[0] = scheduler_stats.num_waiting_reqs stats[1] = scheduler_stats.num_running_reqs - stats_changed = True + self.stats_changed = True if (wave := outputs.wave_complete) is not None: # 2. Notification from rank 0 engine that we've # moved into the global paused state # (engines_running==False). - if current_wave <= wave: + if self.current_wave <= wave: new_wave = wave + 1 logger.debug("Moving DP wave from %d to %d.", - current_wave, new_wave) - current_wave = new_wave - engines_running = False - wave_state_changed = True + self.current_wave, new_wave) + self.current_wave = new_wave + self.engines_running = False + self.stats_changed = True elif (wave := outputs.start_wave) is not None and ( - wave > current_wave or - (wave == current_wave and not engines_running)): + wave > self.current_wave or + (wave == self.current_wave + and not self.engines_running)): # 3. The engine received request for a non-current wave # so we must ensure that other engines progress to the # next wave (race condition handling). logger.debug( "Starting wave %d after notification of " "stale wave request from engine.", wave) - current_wave = wave - engines_running = True - wave_state_changed = True + self.current_wave = wave + self.engines_running = True + self.stats_changed = True self._send_start_wave(publish_back, wave, eng_index) - if wave_state_changed: - message = (None, current_wave, engines_running) - publish_front.send(msgspec.msgpack.encode(message)) - @staticmethod def _send_start_wave(socket: zmq.Socket, wave: int, exclude_engine_index: Optional[int]): @@ -350,8 +304,6 @@ class DPCoordinatorProc: socket.send_multipart( (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded)) - def _get_engine_counts(self, do_copy=False) -> list[list[int]]: + def _get_engine_counts(self) -> list[list[int]]: """Return list of [waiting, running] count lists for each engine.""" - if do_copy: - return [copy.copy(e.request_counts) for e in self.engines] return [e.request_counts for e in self.engines] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 79c47e1028882..f9a6315df8af8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,6 +3,7 @@ import os import queue import signal +import sys import threading import time from collections import deque @@ -18,14 +19,15 @@ import zmq from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group +from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import (decorate_logs, make_zmq_socket, - resolve_obj_by_qualname, set_process_title) +from vllm.utils import (make_zmq_socket, resolve_obj_by_qualname, + set_process_title) from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, unify_kv_cache_configs) from vllm.v1.core.sched.interface import SchedulerInterface @@ -461,11 +463,8 @@ class EngineCoreProc(EngineCore): self.has_coordinator = addresses.coordinator_output is not None self.frontend_stats_publish_address = ( addresses.frontend_stats_publish_address) - logger.debug("Has DP Coordinator: %s, stats publish address: %s", - self.has_coordinator, - self.frontend_stats_publish_address) # Only publish request queue stats to coordinator for "internal" - # and "hybrid" LB modes . + # LB mode. self.publish_dp_lb_stats = ( self.has_coordinator and not vllm_config.parallel_config.data_parallel_external_lb) @@ -475,38 +474,25 @@ class EngineCoreProc(EngineCore): super().__init__(vllm_config, executor_class, log_stats, executor_fail_callback) - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - ready_event = threading.Event() - input_thread = threading.Thread(target=self.process_input_sockets, - args=(addresses.inputs, - addresses.coordinator_input, - identity, ready_event), - daemon=True) - input_thread.start() - - self.output_thread = threading.Thread( - target=self.process_output_sockets, - args=(addresses.outputs, addresses.coordinator_output, - self.engine_index), - daemon=True) - self.output_thread.start() - - # Don't complete handshake until DP coordinator ready message is - # received. - while not ready_event.wait(timeout=10): - if not input_thread.is_alive(): - raise RuntimeError( - "Input socket thread died during startup") - assert addresses.coordinator_input is not None - logger.info("Waiting for READY message from DP Coordinator...") - self.step_fn = (self.step if self.batch_queue is None else self.step_with_batch_queue) + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + threading.Thread(target=self.process_input_sockets, + args=(addresses.inputs, addresses.coordinator_input, + identity), + daemon=True).start() + self.output_thread = threading.Thread( + target=self.process_output_sockets, + args=(addresses.outputs, addresses.coordinator_output, + self.engine_index), + daemon=True) + self.output_thread.start() + @contextmanager def _perform_handshakes( self, @@ -521,10 +507,10 @@ class EngineCoreProc(EngineCore): For DP=1 or offline mode, this is with the colocated front-end process. - For DP>1 with internal load-balancing this is with the shared front-end + For DP>1 with internal loadbalancing this is with the shared front-end process which may reside on a different node. - For DP>1 with external or hybrid load-balancing, two handshakes are + For DP>1 with external or hybrid loadbalancing, two handshakes are performed: - With the rank 0 front-end process which retrieves the DP Coordinator ZMQ addresses and DP process group address. @@ -663,14 +649,12 @@ class EngineCoreProc(EngineCore): "vllm_config"].parallel_config if parallel_config.data_parallel_size > 1 or dp_rank > 0: set_process_title("DPEngineCore", str(dp_rank)) - decorate_logs() # Set data parallel rank for this engine process. parallel_config.data_parallel_rank = dp_rank parallel_config.data_parallel_rank_local = local_dp_rank engine_core = DPEngineCoreProc(*args, **kwargs) else: set_process_title("EngineCore") - decorate_logs() engine_core = EngineCoreProc(*args, **kwargs) engine_core.run_busy_loop() @@ -788,7 +772,7 @@ class EngineCoreProc(EngineCore): def process_input_sockets(self, input_addresses: list[str], coord_input_address: Optional[str], - identity: bytes, ready_event: threading.Event): + identity: bytes): """Input socket IO thread.""" # Msgpack serialization decoding. @@ -825,14 +809,9 @@ class EngineCoreProc(EngineCore): # back to us. input_socket.send(b'') poller.register(input_socket, zmq.POLLIN) - if coord_socket is not None: - # Wait for ready message from coordinator. - assert coord_socket.recv() == b"READY" poller.register(coord_socket, zmq.POLLIN) - ready_event.set() - del ready_event while True: for input_socket, _ in poller.poll(): # (RequestType, RequestData) @@ -926,9 +905,11 @@ class DPEngineCoreProc(EngineCoreProc): log_stats: bool, client_handshake_address: Optional[str] = None, ): + self._decorate_logs() + # Counts forward-passes of the model so that we can synchronize # finished with DP peers every N steps. - self.step_counter = 0 + self.counter = 0 self.current_wave = 0 self.last_counts = (0, 0) @@ -938,6 +919,15 @@ class DPEngineCoreProc(EngineCoreProc): executor_class, log_stats, client_handshake_address, dp_rank) + def _decorate_logs(self): + # Add process-specific prefix to stdout and stderr before + # we initialize the engine. + from multiprocessing import current_process + process_name = current_process().name + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) + def _init_data_parallel(self, vllm_config: VllmConfig): # Configure GPUs and stateless process group for data parallel. @@ -999,9 +989,7 @@ class DPEngineCoreProc(EngineCoreProc): counts = self.scheduler.get_request_counts() if counts != self.last_counts: self.last_counts = counts - stats = SchedulerStats(*counts, - step_counter=self.step_counter, - current_wave=self.current_wave) + stats = SchedulerStats(*counts) self.output_queue.put_nowait( (-1, EngineCoreOutputs(scheduler_stats=stats))) @@ -1043,16 +1031,15 @@ class DPEngineCoreProc(EngineCoreProc): self.output_queue.put_nowait( (client_index, EngineCoreOutputs(wave_complete=self.current_wave))) - # Increment wave count and reset step counter. self.current_wave += 1 - self.step_counter = 0 def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool: # Optimization - only perform finish-sync all-reduce every 32 steps. - self.step_counter += 1 - if self.step_counter % 32 != 0: + self.counter += 1 + if self.counter != 32: return True + self.counter = 0 return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished) @@ -1162,6 +1149,9 @@ class DPEngineCoreActor(DPEngineCoreProc): f"{(local_dp_rank + 1) * world_size}) " f"base value: \"{os.getenv(device_control_env_var)}\"") from e + def _decorate_logs(self): + pass + @contextmanager def _perform_handshakes(self, handshake_address: str, identity: bytes, local_client: bool, vllm_config: VllmConfig, diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4d30bb6b74466..26985df6f62df 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -86,12 +86,11 @@ class EngineCoreClient(ABC): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, - client_count: int = 1, client_index: int = 0, ) -> "MPClient": parallel_config = vllm_config.parallel_config client_args = (vllm_config, executor_class, log_stats, - client_addresses, client_count, client_index) + client_addresses, client_index) if parallel_config.data_parallel_size > 1: if parallel_config.data_parallel_external_lb: # External load balancer - client per DP rank. @@ -728,7 +727,6 @@ class AsyncMPClient(MPClient): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, - client_count: int = 1, client_index: int = 0): super().__init__( asyncio_mode=True, @@ -931,12 +929,11 @@ class DPAsyncMPClient(AsyncMPClient): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, - client_count: int = 1, client_index: int = 0): self.current_wave = 0 super().__init__(vllm_config, executor_class, log_stats, - client_addresses, client_count, client_index) + client_addresses, client_index) # List of [waiting, running] pair per engine. # Used only by DPLBAsyncMPClient subclass. @@ -1032,11 +1029,7 @@ class DPAsyncMPClient(AsyncMPClient): counts, wave, running = msgspec.msgpack.decode(buf) self.current_wave = wave self.engines_running = running - if counts is not None: - sliced_counts = counts[count_slice] - self.lb_engines = sliced_counts - logger.debug("Received counts: %s (%s)", sliced_counts, - count_slice) + self.lb_engines = counts[count_slice] resources.stats_update_task = asyncio.create_task( run_engine_stats_update_task()) @@ -1072,45 +1065,40 @@ class DPLBAsyncMPClient(DPAsyncMPClient): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, - client_count: int = 1, client_index: int = 0): - self.client_count = client_count - # To route aborts to the correct engine. self.reqs_in_flight: dict[str, EngineIdentity] = {} super().__init__(vllm_config, executor_class, log_stats, - client_addresses, client_count, client_index) + client_addresses, client_index) assert len(self.core_engines) > 1 - self.eng_start_index = (len(self.core_engines) * - self.client_index) // client_count - def get_core_engine_for_request( self, request: EngineCoreRequest) -> EngineIdentity: # Engines are in rank order. - current_counts = self.lb_engines if (eng_index := request.data_parallel_rank) is None: - if not current_counts: + if not self.lb_engines: return self.core_engine # TODO use P2C alg for larger DP sizes - num_engines = len(current_counts) - min_score = sys.maxsize + num_engines = len(self.lb_engines) + min_counts = [sys.maxsize, sys.maxsize] eng_index = 0 for i in range(num_engines): # Start from client_index to help with balancing when engines # are empty. - idx = (self.eng_start_index + i) % num_engines - waiting, running = current_counts[idx] - score = waiting * 4 + running - if score < min_score: - min_score = score + idx = (self.client_index + i) % num_engines + counts = self.lb_engines[idx] + if counts < min_counts: + min_counts = counts eng_index = idx - # Increment local waiting count for better balancing between stats - # updates from the coordinator (which happen every 100ms). - current_counts[eng_index][0] += self.client_count + # Adjust local counts for better balancing between stats updates + # from the coordinator (which happen every 100ms). + if min_counts[0]: + min_counts[0] += 1 + else: + min_counts[1] += 1 chosen_engine = self.core_engines[eng_index] # Record which engine is chosen for this request, to handle aborts. diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 224acc47feb27..0f2f404a130ef 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -89,10 +89,6 @@ class Processor: return if not params.allowed_token_ids: raise ValueError("allowed_token_ids is not None and empty!") - if self.tokenizer is None: - # When skip_tokenizer_init=True, we can't validate token IDs - # Skip validation and let the model handle invalid tokens - return tokenizer = self.tokenizer.get_lora_tokenizer(lora_request) vocab_size = len(tokenizer) if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids): @@ -287,9 +283,8 @@ class Processor: len(decoder_inputs["prompt_token_ids"])) sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) - if self.tokenizer is not None: - sampling_params.update_from_tokenizer( - self.tokenizer.get_lora_tokenizer(lora_request)) + sampling_params.update_from_tokenizer( + self.tokenizer.get_lora_tokenizer(lora_request)) else: pooling_params = params.clone() diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index d90051c3224fd..8270385053852 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -4,6 +4,7 @@ import multiprocessing import os import pickle import signal +import sys import threading import time import traceback @@ -27,11 +28,10 @@ from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.executor.multiproc_worker_utils import ( - set_multiprocessing_worker_envs) + _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger -from vllm.utils import (decorate_logs, get_distributed_init_method, - get_loopback_ip, get_mp_context, get_open_port, - set_process_title) +from vllm.utils import (get_distributed_init_method, get_loopback_ip, + get_mp_context, get_open_port, set_process_title) from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase @@ -382,11 +382,11 @@ class WorkerProc: pp_str = f"PP{rank // tp_size}" if pp_size > 1 else "" tp_str = f"TP{rank % tp_size}" if tp_size > 1 else "" suffix = f"{pp_str}{'_' if pp_str and tp_str else ''}{tp_str}" - process_name = "VllmWorker" if suffix: set_process_title(suffix, append=True) - process_name = f"{process_name} {suffix}" - decorate_logs(process_name) + pid = os.getpid() + _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) + _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) # Initialize MessageQueue for receiving SchedulerOutput self.rpc_broadcast_mq = MessageQueue.create_from_handle( diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 9a80460261e02..1eb10ccb6c493 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -33,10 +33,6 @@ class SchedulerStats: num_running_reqs: int = 0 num_waiting_reqs: int = 0 - # These are used for internal DP load-balancing. - step_counter: int = 0 - current_wave: int = 0 - kv_cache_usage: float = 0.0 prefix_cache_stats: PrefixCacheStats = field( diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index d0175695c1d0f..c74d8c543f76c 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -154,7 +154,6 @@ class APIServerProcessManager: client_config = { "input_address": in_addr, "output_address": out_addr, - "client_count": num_servers, "client_index": i } if stats_update_address is not None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 19a8e161f29f0..1b637c9ee9001 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -51,7 +51,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( - AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, + AttentionMetadataBuilder, CommonAttentionMetadata, make_kv_sharing_fast_prefill_attention_metadata, make_local_attention_virtual_batches, split_attn_metadata) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -3137,22 +3137,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) builders.append(attn_metadata_builder_2) - if self.full_cuda_graph: - if attn_metadata_builder_i.attn_cudagraph_support == \ - AttentionCGSupport.NEVER: - raise ValueError(f"Full CUDAGraph not supported for " - f"{attn_backend_i.__name__}. Turn off " - f"CompilationConfig.full_cuda_graph or use a " - f" different attention backend.") - if attn_metadata_builder_i.attn_cudagraph_support == \ - AttentionCGSupport.PURE_DECODE_ONLY: - # Limit the max cudagraph size to the max number of - # sequences for pure decode only cudagraph backend, - # whose max_query_len is 1. - self.cudagraph_batch_sizes = [ - size for size in self.cudagraph_batch_sizes - if size <= self.scheduler_config.max_num_seqs - ] + if (self.full_cuda_graph + and not attn_metadata_builder_i.full_cudagraph_supported): + raise ValueError( + f"Full CUDAGraph not supported for " + f"{attn_backend_i.__name__}. Turn off CompilationConfig." + f"full_cuda_graph or use a different attention backend.") return attn_backend_i, builders def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 4bc4ece9a0df4..0f46ed223ab88 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -321,16 +321,11 @@ class Worker(WorkerBase): if get_pp_group().is_last_rank: max_num_reqs = min(self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens) - # activate building attn_metadata for this dummy run to avoid - # potential illegal memory access for full cudagraph relay. - attn_cudagraph = self.compilation_config.full_cuda_graph and\ - not self.model_config.enforce_eager # We skip EPLB here since we don't want to record dummy metrics hidden_states, last_hidden_states = \ self.model_runner._dummy_run( num_tokens=max_num_reqs, - capture_attn_cudagraph=attn_cudagraph, skip_eplb=True, ) if self.model_runner.is_pooling_model: